Compare commits

..

3 Commits

Author SHA1 Message Date
Heikki Linnakangas
a77dd0700c Move startup tracing context handling 2024-05-03 09:28:19 +03:00
Em Sharnoff
c9472434c9 compute_ctl: Break up main() into discrete phases
This commit is intentionally designed to have as small a diff as
possible. To that end, the basic idea is that each distinct "chunk" of
the previous main() has been wrapped in its own function, with the
return values from each function being passed directly into the next.

The structure of main() is now visible from its contents:

  1. init()
  2. process_cli()
  3. wait_spec()
  4. start_postgres()
  5. wait_postgres()
  6. cleanup_and_exit()

There's a lot of other work that can / should(?) be done beyond this,
but I figure that's more opinionated, and this should be a solid start.
2024-05-01 12:07:46 -07:00
Em Sharnoff
f9c2945f74 compute_ctl: Non-functional prep changes to reduce diff
A couple lines moved further down in main(), and one case of using
Option<&str> instead of Option<&String>.
2024-05-01 12:06:55 -07:00
169 changed files with 2115 additions and 6740 deletions
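
As a bridge between the commit messages above and the diffs below: the phase structure described in commit c9472434c9 can be sketched as a compilable outline. This is a simplified stand-in, not the actual code; it assumes the anyhow crate (which the real code uses), and the stub types here stand in for the ProcessCliResult / WaitSpecResult / StartPostgresResult structs introduced in the compute_ctl diff further down.

// Compilable sketch of the phase chaining from commit c9472434c9.
// All phase bodies are stubs; only the control flow mirrors the diff.
type ProcessCliResult = ();
type WaitSpecResult = ();
type StartPostgresResult = ();
type WaitPostgresResult = ();
type PostgresHandle = ();

fn init() -> anyhow::Result<(String, ())> { Ok(("latest".to_string(), ())) }
fn process_cli(_args: &()) -> anyhow::Result<ProcessCliResult> { Ok(()) }
fn wait_spec(_tag: String, _cli: ProcessCliResult) -> anyhow::Result<WaitSpecResult> { Ok(()) }
fn start_postgres(_args: &(), _ws: WaitSpecResult) -> anyhow::Result<(Option<PostgresHandle>, StartPostgresResult)> { Ok((None, ())) }
fn wait_postgres(_pg: Option<PostgresHandle>) -> anyhow::Result<WaitPostgresResult> { Ok(()) }
fn cleanup_and_exit(_s: StartPostgresResult, _w: WaitPostgresResult) -> anyhow::Result<()> { Ok(()) }

fn main() -> anyhow::Result<()> {
    let (build_tag, clap_args) = init()?;                  // 1. logging, signals, build tag
    let cli_result = process_cli(&clap_args)?;             // 2. parse CLI arguments
    let wait_spec_result = wait_spec(build_tag, cli_result)?; // 3. obtain the compute spec
    let (pg_handle, start_pg_result) = start_postgres(&clap_args, wait_spec_result)?; // 4. launch Postgres
    let wait_pg_result = wait_postgres(pg_handle)?;        // 5. wait for Postgres to exit
    cleanup_and_exit(start_pg_result, wait_pg_result)      // 6. teardown, propagate exit code
}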

View File

@@ -236,6 +236,27 @@ jobs:
submodules: true
fetch-depth: 1
- name: Check Postgres submodules revision
shell: bash -euo pipefail {0}
run: |
# This is a temporary solution to ensure that the Postgres submodule revisions are correct (i.e. updated intentionally).
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
FAILED=false
for postgres in postgres-v14 postgres-v15 postgres-v16; do
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
actual=$(git rev-parse "HEAD:vendor/${postgres}")
if [ "${expected}" != "${actual}" ]; then
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
FAILED=true
fi
done
if [ "${FAILED}" = "true" ]; then
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
exit 1
fi
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT

Cargo.lock generated
View File

@@ -595,7 +595,7 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"hyper-rustls",
"once_cell",
"pin-project-lite",
"pin-utils",
@@ -684,7 +684,7 @@ dependencies = [
"http-body 0.4.5",
"hyper 0.14.26",
"itoa",
"matchit 0.7.0",
"matchit",
"memchr",
"mime",
"percent-encoding",
@@ -740,7 +740,7 @@ dependencies = [
"pin-project",
"quick-xml",
"rand 0.8.5",
"reqwest 0.11.19",
"reqwest",
"rustc_version",
"serde",
"serde_json",
@@ -865,12 +865,6 @@ version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "base64-simd"
version = "0.8.0"
@@ -1216,7 +1210,7 @@ dependencies = [
"postgres",
"regex",
"remote_storage",
"reqwest 0.12.4",
"reqwest",
"rust-ini",
"serde",
"serde_json",
@@ -1335,7 +1329,7 @@ dependencies = [
"postgres_backend",
"postgres_connection",
"regex",
"reqwest 0.12.4",
"reqwest",
"safekeeper_api",
"scopeguard",
"serde",
@@ -1348,7 +1342,6 @@ dependencies = [
"tokio-postgres",
"tokio-util",
"toml",
"toml_edit",
"tracing",
"url",
"utils",
@@ -2370,17 +2363,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "hostname"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
dependencies = [
"cfg-if",
"libc",
"windows 0.52.0",
]
[[package]]
name = "http"
version = "0.2.9"
@@ -2527,7 +2509,6 @@ dependencies = [
"pin-project-lite",
"smallvec",
"tokio",
"want",
]
[[package]]
@@ -2545,23 +2526,6 @@ dependencies = [
"tokio-rustls 0.24.0",
]
[[package]]
name = "hyper-rustls"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
dependencies = [
"futures-util",
"http 1.1.0",
"hyper 1.2.0",
"hyper-util",
"rustls 0.22.4",
"rustls-pki-types",
"tokio",
"tokio-rustls 0.25.0",
"tower-service",
]
[[package]]
name = "hyper-timeout"
version = "0.4.1"
@@ -2609,7 +2573,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
@@ -2617,9 +2580,6 @@ dependencies = [
"pin-project-lite",
"socket2 0.5.5",
"tokio",
"tower",
"tower-service",
"tracing",
]
[[package]]
@@ -2633,7 +2593,7 @@ dependencies = [
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows 0.48.0",
"windows",
]
[[package]]
@@ -2956,12 +2916,6 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
[[package]]
name = "matchit"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"
[[package]]
name = "md-5"
version = "0.10.5"
@@ -3095,6 +3049,16 @@ version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "mime_guess"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
dependencies = [
"mime",
"unicase",
]
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@@ -3438,7 +3402,7 @@ dependencies = [
"bytes",
"http 0.2.9",
"opentelemetry_api",
"reqwest 0.11.19",
"reqwest",
]
[[package]]
@@ -3456,7 +3420,7 @@ dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
"prost",
"reqwest 0.11.19",
"reqwest",
"thiserror",
"tokio",
"tonic",
@@ -3685,14 +3649,13 @@ dependencies = [
"rand 0.8.5",
"regex",
"remote_storage",
"reqwest 0.12.4",
"reqwest",
"rpds",
"scopeguard",
"serde",
"serde_json",
"serde_path_to_error",
"serde_with",
"sha2",
"signal-hook",
"smallvec",
"storage_broker",
@@ -3756,7 +3719,7 @@ dependencies = [
"futures",
"pageserver_api",
"postgres",
"reqwest 0.12.4",
"reqwest",
"serde",
"thiserror",
"tokio",
@@ -4365,7 +4328,7 @@ dependencies = [
"hashlink",
"hex",
"hmac",
"hostname 0.3.1",
"hostname",
"http 1.1.0",
"http-body-util",
"humantime",
@@ -4373,7 +4336,6 @@ dependencies = [
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"indexmap 2.0.1",
"ipnet",
"itertools",
"lasso",
@@ -4399,7 +4361,7 @@ dependencies = [
"redis",
"regex",
"remote_storage",
"reqwest 0.12.4",
"reqwest",
"reqwest-middleware",
"reqwest-retry",
"reqwest-tracing",
@@ -4426,7 +4388,6 @@ dependencies = [
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-util",
"tower-service",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -4717,7 +4678,6 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"sync_wrapper",
"test-context",
"tokio",
"tokio-stream",
@@ -4743,106 +4703,69 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"mime_guess",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.11",
"rustls-pemfile 1.0.2",
"serde",
"serde_json",
"serde_urlencoded",
"tokio",
"tokio-native-tls",
"tokio-rustls 0.24.0",
"tokio-util",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"wasm-streams 0.3.0",
"wasm-streams",
"web-sys",
"winreg 0.50.0",
]
[[package]]
name = "reqwest"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-channel",
"futures-core",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"hyper 1.2.0",
"hyper-rustls 0.26.0",
"hyper-util",
"ipnet",
"js-sys",
"log",
"mime",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tokio-rustls 0.25.0",
"tokio-util",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"wasm-streams 0.4.0",
"web-sys",
"webpki-roots 0.26.1",
"winreg 0.52.0",
"webpki-roots 0.25.2",
"winreg",
]
[[package]]
name = "reqwest-middleware"
version = "0.3.0"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01"
checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d"
dependencies = [
"anyhow",
"async-trait",
"http 1.1.0",
"reqwest 0.12.4",
"http 0.2.9",
"reqwest",
"serde",
"task-local-extensions",
"thiserror",
"tower-service",
]
[[package]]
name = "reqwest-retry"
version = "0.5.0"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5"
checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
dependencies = [
"anyhow",
"async-trait",
"chrono",
"futures",
"getrandom 0.2.11",
"http 1.1.0",
"hyper 1.2.0",
"http 0.2.9",
"hyper 0.14.26",
"parking_lot 0.11.2",
"reqwest 0.12.4",
"reqwest",
"reqwest-middleware",
"retry-policies",
"task-local-extensions",
"tokio",
"tracing",
"wasm-timer",
@@ -4850,27 +4773,27 @@ dependencies = [
[[package]]
name = "reqwest-tracing"
version = "0.5.0"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3"
checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3"
dependencies = [
"anyhow",
"async-trait",
"getrandom 0.2.11",
"http 1.1.0",
"matchit 0.8.2",
"matchit",
"opentelemetry",
"reqwest 0.12.4",
"reqwest",
"reqwest-middleware",
"task-local-extensions",
"tracing",
"tracing-opentelemetry",
]
[[package]]
name = "retry-policies"
version = "0.3.0"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810"
checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
dependencies = [
"anyhow",
"chrono",
@@ -5196,7 +5119,7 @@ dependencies = [
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"reqwest",
"serde",
"serde_json",
"serde_with",
@@ -5247,7 +5170,7 @@ dependencies = [
"rand 0.8.5",
"regex",
"remote_storage",
"reqwest 0.12.4",
"reqwest",
"safekeeper_api",
"scopeguard",
"sd-notify",
@@ -5377,12 +5300,12 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
[[package]]
name = "sentry"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
dependencies = [
"httpdate",
"reqwest 0.12.4",
"reqwest",
"rustls 0.21.11",
"sentry-backtrace",
"sentry-contexts",
@@ -5396,9 +5319,9 @@ dependencies = [
[[package]]
name = "sentry-backtrace"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e"
checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9"
dependencies = [
"backtrace",
"once_cell",
@@ -5408,11 +5331,11 @@ dependencies = [
[[package]]
name = "sentry-contexts"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a"
dependencies = [
"hostname 0.4.0",
"hostname",
"libc",
"os_info",
"rustc_version",
@@ -5422,9 +5345,9 @@ dependencies = [
[[package]]
name = "sentry-core"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826"
checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055"
dependencies = [
"once_cell",
"rand 0.8.5",
@@ -5435,9 +5358,9 @@ dependencies = [
[[package]]
name = "sentry-panic"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d"
checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7"
dependencies = [
"sentry-backtrace",
"sentry-core",
@@ -5445,9 +5368,9 @@ dependencies = [
[[package]]
name = "sentry-tracing"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe"
checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3"
dependencies = [
"sentry-backtrace",
"sentry-core",
@@ -5457,13 +5380,13 @@ dependencies = [
[[package]]
name = "sentry-types"
version = "0.32.3"
version = "0.31.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c"
checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd"
dependencies = [
"debugid",
"getrandom 0.2.11",
"hex",
"rand 0.8.5",
"serde",
"serde_json",
"thiserror",
@@ -5855,12 +5778,10 @@ dependencies = [
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest 0.12.4",
"reqwest",
"routerify",
"serde",
"serde_json",
"strum",
"strum_macros",
"thiserror",
"tokio",
"tokio-util",
@@ -5879,7 +5800,7 @@ dependencies = [
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
"reqwest 0.12.4",
"reqwest",
"serde",
"serde_json",
"thiserror",
@@ -5933,7 +5854,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "svg_fmt"
version = "0.4.2"
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"
[[package]]
name = "syn"
@@ -5962,9 +5883,6 @@ name = "sync_wrapper"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
dependencies = [
"futures-core",
]
[[package]]
name = "synstructure"
@@ -6517,11 +6435,10 @@ dependencies = [
[[package]]
name = "tracing"
version = "0.1.37"
version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
"cfg-if",
"log",
"pin-project-lite",
"tracing-attributes",
@@ -6541,9 +6458,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
version = "0.1.24"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
"proc-macro2",
"quote",
@@ -6552,9 +6469,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.31"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
"once_cell",
"valuable",
@@ -6583,14 +6500,12 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
version = "0.21.0"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8"
checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19"
dependencies = [
"once_cell",
"opentelemetry",
"opentelemetry_sdk",
"smallvec",
"tracing",
"tracing-core",
"tracing-log",
@@ -6636,7 +6551,7 @@ dependencies = [
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
"reqwest 0.12.4",
"reqwest",
"tokio",
"tracing",
"tracing-opentelemetry",
@@ -6722,6 +6637,15 @@ dependencies = [
"libc",
]
[[package]]
name = "unicase"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
dependencies = [
"version_check",
]
[[package]]
name = "unicode-bidi"
version = "0.3.13"
@@ -7080,19 +7004,6 @@ dependencies = [
"web-sys",
]
[[package]]
name = "wasm-streams"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129"
dependencies = [
"futures-util",
"js-sys",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "wasm-timer"
version = "0.2.5"
@@ -7133,15 +7044,6 @@ version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
[[package]]
name = "webpki-roots"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "which"
version = "4.4.0"
@@ -7193,25 +7095,6 @@ dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "windows"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
dependencies = [
"windows-core",
"windows-targets 0.52.4",
]
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-sys"
version = "0.42.0"
@@ -7444,16 +7327,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "winreg"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5"
dependencies = [
"cfg-if",
"windows-sys 0.48.0",
]
[[package]]
name = "workspace_hack"
version = "0.1.0"
@@ -7503,8 +7376,7 @@ dependencies = [
"regex",
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"reqwest 0.11.19",
"reqwest 0.12.4",
"reqwest",
"rustls 0.21.11",
"scopeguard",
"serde",
@@ -7514,7 +7386,6 @@ dependencies = [
"subtle",
"syn 1.0.109",
"syn 2.0.52",
"sync_wrapper",
"time",
"time-macros",
"tokio",

View File

@@ -99,7 +99,6 @@ humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -131,10 +130,10 @@ prost = "0.11"
rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
reqwest-middleware = "0.3.0"
reqwest-retry = "0.5"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
@@ -144,7 +143,7 @@ rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_path_to_error = "0.1"
@@ -178,10 +177,9 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"

View File

@@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION=18
ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.78.0
ENV RUSTC_VERSION=1.77.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -81,14 +81,11 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
VERSION=$*; \
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
# nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration.

View File

@@ -47,10 +47,11 @@ use chrono::Utc;
use clap::Arg;
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
use tracing::{error, info};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -62,13 +63,35 @@ use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in case the environment variable is not set
const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) =
{
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_result = process_cli(&clap_args)?;
let wait_spec_result = wait_spec(build_tag, cli_result)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing context
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
cleanup_and_exit(start_pg_result, wait_pg_result)
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -83,36 +106,11 @@ fn main() -> Result<()> {
.to_string();
info!("build_tag: {build_tag}");
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
Ok((build_tag, cli().get_matches()))
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
{
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -149,7 +147,7 @@ fn main() -> Result<()> {
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
@@ -159,8 +157,42 @@ fn main() -> Result<()> {
Some(guard)
} else {
None
};
}
}
fn process_cli(
matches: &clap::ArgMatches,
) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -201,6 +233,45 @@ fn main() -> Result<()> {
}
};
let result = ProcessCliResult {
// directly from CLI:
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
// others:
spec,
live_config_allowed,
};
Ok(result)
}
struct ProcessCliResult<'clap> {
connstr: &'clap str,
pgdata: &'clap str,
pgbin: &'clap str,
ext_remote_storage: Option<&'clap str>,
http_port: u16,
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec,
live_config_allowed,
}: ProcessCliResult,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
@@ -228,19 +299,17 @@ fn main() -> Result<()> {
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// because QEMU will already have it's memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
// Launch http service first, so that we can serve control-plane requests
// while configuration is still in progress.
// Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress.
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -255,22 +324,34 @@ fn main() -> Result<()> {
break;
}
}
// Record for how long we slept waiting for the spec.
let now = Utc::now();
state.metrics.wait_for_spec_ms = now
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time, so that the total startup time that is calculated later will
// not include the time that we waited for the spec.
state.start_time = now;
}
Ok(WaitSpecResult { compute, http_port })
}
struct WaitSpecResult {
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
}
fn start_postgres(
matches: &clap::ArgMatches,
WaitSpecResult { compute, http_port }: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
// Record for how long we slept waiting for the spec.
state.metrics.wait_for_spec_ms = Utc::now()
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time to the actual start of the configuration, so that
// total startup time was properly measured at the end.
state.start_time = Utc::now();
state.status = ComputeStatus::Init;
compute.state_changed.notify_all();
@@ -278,72 +359,34 @@ fn main() -> Result<()> {
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
// before we release the mutex, fetch the swap size (if any) for later.
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
drop(state);
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let mut prestartup_failed = false;
let mut delay_exit = false;
// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{err:?}"));
state.status = ComputeStatus::Failed;
compute.state_changed.notify_all();
delay_exit = true;
}
}
}
let extension_server_port: u16 = http_port;
// Start Postgres
let mut pg = None;
let mut exit_code = None;
if !prestartup_failed {
pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:#}", err);
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
// Notify others that Postgres failed to start. In case of configuring the
// empty compute, it's likely that API handler is still waiting for compute
// state change. With this we will notify it that compute is in Failed state,
// so control plane will know about it earlier and record proper error instead
// of timeout.
compute.state_changed.notify_all();
drop(state); // unlock
delay_exit = true;
None
}
};
} else {
warn!("skipping postgres startup because pre-startup step failed");
}
let mut delay_exit = false;
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:#}", err);
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
// Notify others that Postgres failed to start. In case of configuring the
// empty compute, it's likely that API handler is still waiting for compute
// state change. With this we will notify it that compute is in Failed state,
// so control plane will know about it earlier and record proper error instead
// of timeout.
compute.state_changed.notify_all();
drop(state); // unlock
delay_exit = true;
None
}
};
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
@@ -376,7 +419,7 @@ fn main() -> Result<()> {
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = &rt.as_ref().map(|rt| {
let vm_monitor = rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
@@ -389,12 +432,43 @@ fn main() -> Result<()> {
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(
pg: Option<PostgresHandle>,
) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -409,6 +483,26 @@ fn main() -> Result<()> {
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_and_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
WaitPostgresResult { exit_code }: WaitPostgresResult,
) -> Result<()> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -568,11 +662,6 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("resize-swap-on-bind")
.long("resize-swap-on-bind")
.action(clap::ArgAction::SetTrue),
)
}
/// When compute_ctl is killed, also send a termination signal to sync-safekeepers
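
On startup_context_from_env above: TRACEPARENT and TRACESTATE follow the W3C trace-context format, so whatever launches compute_ctl can hand its current span down by exporting those variables first. A hypothetical caller-side sketch (the trace IDs shown are made-up examples; nothing in this diff defines such a caller):

// Hypothetical caller: propagate a span into compute_ctl via the W3C
// trace-context env vars that startup_context_from_env reads.
use std::process::{Child, Command};

fn spawn_compute_ctl_with_trace() -> std::io::Result<Child> {
    Command::new("compute_ctl")
        // made-up IDs in the standard traceparent layout: version-traceid-spanid-flags
        .env("TRACEPARENT", "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01")
        .env("TRACESTATE", "")
        .spawn()
}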

View File

@@ -14,5 +14,4 @@ pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod spec;
pub mod swap;
pub mod sync_sk;

View File

@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
"rename_db" => {
let new_name = op.new_name.as_ref().unwrap();
if existing_dbs.contains_key(&op.name) {
if existing_dbs.get(&op.name).is_some() {
let query: String = format!(
"ALTER DATABASE {} RENAME TO {}",
op.name.pg_quote(),

View File

@@ -1,36 +0,0 @@
use anyhow::{anyhow, Context};
use tracing::warn;
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
// run `/neonvm/bin/resize-swap --once {size_bytes}`
//
// Passing '--once' causes resize-swap to delete itself after successful completion, which
// means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
// postgres is running.
//
// NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(RESIZE_SWAP_BIN)
.arg("--once")
.arg(size_bytes.to_string())
.spawn();
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
return Ok(());
}
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => Err(anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {
format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
})
}

View File

@@ -28,7 +28,6 @@ serde_with.workspace = true
tar.workspace = true
thiserror.workspace = true
toml.workspace = true
toml_edit.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-util.workspace = true

View File

@@ -14,15 +14,15 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::config::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::{
@@ -133,7 +133,7 @@ fn main() -> Result<()> {
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -358,13 +358,6 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
default_conf(*num_pageservers)
};
let pageserver_config: toml_edit::Document =
if let Some(path) = init_match.get_one::<PathBuf>("pageserver-config") {
std::fs::read_to_string(path)?.parse()?
} else {
toml_edit::Document::new()
};
let pg_version = init_match
.get_one::<u32>("pg-version")
.copied()
@@ -382,7 +375,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
// Initialize pageserver, create initial tenant and timeline.
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
.initialize(&pageserver_config)
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
@@ -404,6 +397,15 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
PageServerNode::from_env(env, ps_conf)
}
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.get_many::<String>("pageserver-config-override")
.into_iter()
.flatten()
.map(String::as_str)
.collect()
}
async fn handle_tenant(
tenant_match: &ArgMatches,
env: &mut local_env::LocalEnv,
@@ -835,8 +837,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.copied()
.unwrap_or(false);
let allow_multiple = sub_args.get_flag("allow-multiple");
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
@@ -854,9 +854,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
_ => {}
}
if !allow_multiple {
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
}
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
cplane.new_endpoint(
&endpoint_id,
@@ -885,8 +883,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
let allow_multiple = sub_args.get_flag("allow-multiple");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -912,13 +908,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.cloned()
.unwrap_or_default();
if !allow_multiple {
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
endpoint.timeline_id,
)?;
}
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
endpoint.timeline_id,
)?;
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
@@ -1074,7 +1068,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -1100,7 +1097,10 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver.start().await {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -1227,7 +1227,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically
broker::start_broker_process(env).await?;
@@ -1244,7 +1244,10 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start().await {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
exit(1);
@@ -1385,6 +1388,13 @@ fn cli() -> Command {
.required(false)
.value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.num_args(1)
.action(ArgAction::Append)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
@@ -1434,33 +1444,20 @@ fn cli() -> Command {
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
.required(false);
let allow_multiple = Arg::new("allow-multiple")
.help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
.long("allow-multiple")
.action(ArgAction::SetTrue)
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository, preparing configs for services to start with")
.arg(pageserver_config_args.clone())
.arg(num_pageservers_arg.clone())
.arg(
Arg::new("config")
.long("config")
.required(false)
.value_parser(value_parser!(PathBuf))
.value_name("config")
)
.arg(
Arg::new("pageserver-config")
.long("pageserver-config")
.required(false)
.value_parser(value_parser!(PathBuf))
.value_name("pageserver-config")
.help("Merge the provided pageserver config into the one generated by neon_local."),
.value_name("config"),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
@@ -1542,6 +1539,7 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
@@ -1549,6 +1547,7 @@ fn cli() -> Command {
)
.subcommand(Command::new("restart")
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
@@ -1602,7 +1601,6 @@ fn cli() -> Command {
.arg(pg_version_arg.clone())
.arg(hot_standby_arg.clone())
.arg(update_catalog)
.arg(allow_multiple.clone())
)
.subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1611,7 +1609,6 @@ fn cli() -> Command {
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
.arg(create_test_user)
.arg(allow_multiple.clone())
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
@@ -1663,6 +1660,7 @@ fn cli() -> Command {
.subcommand(
Command::new("start")
.about("Start page server and safekeepers")
.arg(pageserver_config_args)
)
.subcommand(
Command::new("stop")

View File

@@ -554,7 +554,6 @@ impl Endpoint {
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
swap_size_bytes: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used

View File

@@ -382,10 +382,7 @@ impl LocalEnv {
// Find neon binaries.
if env.neon_distrib_dir == Path::new("") {
env::current_exe()?
.parent()
.unwrap()
.clone_into(&mut env.neon_distrib_dir);
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if env.pageservers.is_empty() {

View File

@@ -4,6 +4,7 @@
//!
//! .neon/
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
@@ -17,8 +18,7 @@ use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
TimelineInfo,
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
@@ -77,7 +77,7 @@ impl PageServerNode {
/// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &toml_edit::Document) -> Vec<String> {
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
@@ -157,7 +157,10 @@ impl PageServerNode {
}
}
if !cli_overrides.contains_key("remote_storage") {
if !cli_overrides
.iter()
.any(|c| c.starts_with("remote_storage"))
{
overrides.push(format!(
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
));
@@ -170,13 +173,13 @@ impl PageServerNode {
}
// Apply the user-provided overrides
overrides.push(cli_overrides.to_string());
overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
overrides
}
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
@@ -194,11 +197,11 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<()> {
self.start_node().await
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
let datadir = self.repo_path();
let node_id = self.conf.id;
println!(
@@ -216,18 +219,11 @@ impl PageServerNode {
let datadir_path_str = datadir.to_str().with_context(|| {
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
})?;
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
args.push(Cow::Borrowed("--init"));
// `pageserver --init` merges the `--config-override`s into a built-in default config,
// then writes out the merged product to `pageserver.toml`.
// TODO: just write the full `pageserver.toml` and get rid of `--config-override`.
let mut args = vec!["--init", "--workdir", datadir_path_str];
let overrides = self.neon_local_overrides(config_overrides);
for piece in &overrides {
args.push("--config-override");
args.push(piece);
}
let init_output = Command::new(self.env.pageserver_bin())
.args(args)
.args(args.iter().map(Cow::as_ref))
.envs(self.pageserver_env_variables()?)
.output()
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
@@ -252,13 +248,12 @@ impl PageServerNode {
// situation: the metadata is written by some other script.
std::fs::write(
metadata_path,
serde_json::to_vec(&pageserver_api::config::NodeMetadata {
postgres_host: "localhost".to_string(),
postgres_port: self.pg_connection_config.port(),
http_host: "localhost".to_string(),
http_port,
other: HashMap::new(),
})
serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": self.pg_connection_config.port(),
"http_host": "localhost",
"http_port": http_port,
}))
.unwrap(),
)
.expect("Failed to write metadata file");
@@ -266,7 +261,11 @@ impl PageServerNode {
Ok(())
}
async fn start_node(&self) -> anyhow::Result<()> {
async fn start_node(
&self,
config_overrides: &[&str],
update_config: bool,
) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -283,12 +282,15 @@ impl PageServerNode {
self.conf.id, datadir,
)
})?;
let args = vec!["-D", datadir_path_str];
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
background_process::start_process(
"pageserver",
&datadir,
&self.env.pageserver_bin(),
args,
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
@@ -305,6 +307,22 @@ impl PageServerNode {
Ok(())
}
fn pageserver_basic_args<'a>(
&self,
config_overrides: &'a [&'a str],
datadir_path_str: &'a str,
) -> Vec<Cow<'a, str>> {
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
let overrides = self.neon_local_overrides(config_overrides);
for config_override in overrides {
args.push(Cow::Borrowed("-c"));
args.push(Cow::Owned(config_override));
}
args
}
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
// needs a token, and how to generate that token, seems independent to whether
@@ -430,11 +448,11 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -553,11 +571,11 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
}
};

View File

@@ -3,6 +3,7 @@ use crate::{
local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
@@ -16,7 +17,6 @@ use pageserver_api::{
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr};
use tokio::process::Command;
@@ -379,7 +379,7 @@ impl StorageController {
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: reqwest::Method,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> anyhow::Result<RS>

View File

@@ -1,6 +1,7 @@
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
@@ -13,7 +14,7 @@ use pageserver_api::{
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::{Method, StatusCode, Url};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
@@ -231,7 +232,7 @@ impl Client {
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: Method,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>

View File

@@ -33,23 +33,6 @@ pub struct ComputeSpec {
#[serde(default)]
pub features: Vec<ComputeFeature>,
/// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
/// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
/// received.
///
/// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
/// spec generation doesn't need to be aware of the actual compute it's running on, while
/// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
/// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
/// giving every VM much more swap than it should have (32GiB).
///
/// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
/// enabling the swap resizing behavior once rollout is complete.
///
/// See neondatabase/cloud#12047 for more.
#[serde(default)]
pub swap_size_bytes: Option<u64>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
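
A minimal sketch of the dual gating the swap_size_bytes doc comment describes (a hypothetical helper; the real check is the `if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind)` in the compute_ctl diff above):

// Hypothetical helper: swap is resized only when the control plane set the
// spec field AND the VM was started with --resize-swap-on-bind, so neither
// side alone can trigger a resize during the gradual rollout.
fn swap_resize_target(swap_size_bytes: Option<u64>, resize_swap_on_bind: bool) -> Option<u64> {
    match (swap_size_bytes, resize_swap_on_bind) {
        (Some(size_bytes), true) => Some(size_bytes),
        _ => None,
    }
}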

View File

@@ -480,15 +480,6 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
let id = self.vec.with_labels(labels);
self.vec.remove_metric(id)
}
pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
let id = self.vec.with_labels(labels);
let metric = self.vec.get_metric(id);
let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
inc.saturating_sub(dec)
}
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>

View File

@@ -1,31 +0,0 @@
use std::collections::HashMap;
use const_format::formatcp;
#[cfg(test)]
mod tests;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
// as a separate structure. This information is not needed by the pageserver
// itself; it is only used for registering the pageserver with the control
// plane and/or storage controller.
//
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeMetadata {
#[serde(rename = "host")]
pub postgres_host: String,
#[serde(rename = "port")]
pub postgres_port: u16,
pub http_host: String,
pub http_port: u16,
// Deployment tools may write fields to the metadata file beyond what we
// use in this type: this type intentionally only names the fields it requires.
#[serde(flatten)]
pub other: HashMap<String, serde_json::Value>,
}

View File

@@ -1,22 +0,0 @@
use super::*;
#[test]
fn test_node_metadata_v1_backward_compatibilty() {
let v1 = serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": 23,
"http_host": "localhost",
"http_port": 42,
}));
assert_eq!(
serde_json::from_slice::<NodeMetadata>(&v1.unwrap()).unwrap(),
NodeMetadata {
postgres_host: "localhost".to_string(),
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
other: HashMap::new(),
}
)
}

View File

@@ -80,7 +80,7 @@ impl Key {
}
/// Get the range of metadata keys.
pub const fn metadata_key_range() -> Range<Self> {
pub fn metadata_key_range() -> Range<Self> {
Key {
field1: METADATA_KEY_BEGIN_PREFIX,
field2: 0,
@@ -572,17 +572,14 @@ pub const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
/// Non-inherited range for vectored get.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
!NON_INHERITED_RANGE.contains(&key)
}
#[inline(always)]

View File

@@ -240,7 +240,7 @@ impl<'a> ShardedRange<'a> {
/// pages that would not actually be stored on this node.
///
/// Don't use this function in code that works with physical entities like layer files.
pub fn raw_size(range: &Range<Key>) -> u32 {
fn raw_size(range: &Range<Key>) -> u32 {
if is_contiguous_range(range) {
contiguous_range_len(range)
} else {

View File

@@ -1,5 +1,6 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
pub mod controller_api;
pub mod key;
@@ -10,4 +11,7 @@ pub mod shard;
/// Public API types
pub mod upcall_api;
pub mod config;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

View File

@@ -1,4 +1,3 @@
pub mod detach_ancestor;
pub mod partitioning;
pub mod utilization;
@@ -9,7 +8,6 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
time::{Duration, SystemTime},
};
@@ -305,31 +303,7 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_aux_file_policy: Option<AuxFilePolicy>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
V1,
V2,
CrossValidation,
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.to_lowercase();
if s == "v1" {
Ok(Self::V1)
} else if s == "v2" {
Ok(Self::V2)
} else if s == "crossvalidation" || s == "cross_validation" {
Ok(Self::CrossValidation)
} else {
anyhow::bail!("cannot parse {} to aux file policy", s)
}
}
pub switch_to_aux_file_v2: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -456,6 +430,8 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
#[serde(skip_serializing_if = "Option::is_none")]
pub tenant_id: Option<TenantShardId>,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}

View File

@@ -1,6 +0,0 @@
use utils::id::TimelineId;
#[derive(Default, serde::Serialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -97,7 +97,7 @@ impl ShardCount {
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`].
/// as `TenantShardId::unsharded`.
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
@@ -116,9 +116,7 @@ impl ShardCount {
self.0
}
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
/// uses the legacy format for `TenantShardId`. See also the documentation for
/// [`Self::count`].
///
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
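
The zero-means-unsharded encoding is subtle enough to deserve a worked example; a self-contained sketch of the documented semantics (a simplified stand-in, not the crate's actual type):

struct ShardCount(u8);

impl ShardCount {
    // 0 is the legacy "unsharded" encoding: one shard, and the TenantShardId
    // is rendered without a shard suffix.
    fn count(&self) -> u8 {
        if self.0 == 0 { 1 } else { self.0 }
    }
    fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
}

fn main() {
    assert_eq!(ShardCount(0).count(), 1);
    assert!(ShardCount(0).is_unsharded());
    assert_eq!(ShardCount(4).count(), 4);
    assert!(!ShardCount(4).is_unsharded());
}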

View File

@@ -331,10 +331,7 @@ impl CheckPoint {
/// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool {
// nextXid should be greater than any XID in WAL, so increment the provided XID and check for wraparound.
let mut new_xid = std::cmp::max(
xid.wrapping_add(1),
pg_constants::FIRST_NORMAL_TRANSACTION_ID,
);
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
new_xid =
@@ -370,16 +367,8 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
let first_page_only = seg_off < XLOG_BLCKSZ;
// If the first record starts in the middle of the page, pretend the page
// header contains a fake record which ends where the first real record
// starts. This keeps pg_waldump etc. happy.
let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
// xlp_rem_len doesn't include page header, hence the subtraction.
(
seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
pg_constants::XLP_FIRST_IS_CONTRECORD,
)
let (shdr_rem_len, infoflags) = if first_page_only {
(seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
} else {
(0, 0)
};
@@ -408,22 +397,20 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
if !first_page_only {
let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
// see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
let (xlp_rem_len, xlp_info) = if page_off > 0 {
assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
(
(page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
pg_constants::XLP_FIRST_IS_CONTRECORD,
)
} else {
(0, 0)
};
let header = XLogPageHeaderData {
xlp_magic: XLOG_PAGE_MAGIC as u16,
xlp_info,
xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
pg_constants::XLP_FIRST_IS_CONTRECORD
} else {
0
},
xlp_tli: PG_TLI,
xlp_pageaddr: lsn.page_lsn().0,
xlp_rem_len,
xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
page_off as u32
} else {
0u32
},
..Default::default() // Put 0 in padding fields.
};
let hdr_bytes = header.encode()?;
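
The fake-record bookkeeping in this hunk is easier to follow in isolation; here is the pre-change first-page logic as a standalone sketch (the flag value is assumed from PostgreSQL's xlog_internal.h):

const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;

// seg_off: byte offset of the first real record within the segment's first
// page; long_phd_len: size of the long page header at the segment start.
fn first_page_contrecord(seg_off: usize, long_phd_len: usize) -> (usize, u16) {
    if seg_off > 0 {
        assert!(seg_off >= long_phd_len);
        // xlp_rem_len excludes the page header, hence the subtraction.
        (seg_off - long_phd_len, XLP_FIRST_IS_CONTRECORD)
    } else {
        (0, 0)
    }
}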

View File

@@ -38,7 +38,6 @@ azure_storage_blobs.workspace = true
futures-util.workspace = true
http-types.workspace = true
itertools.workspace = true
sync_wrapper = { workspace = true, features = ["futures"] }
[dev-dependencies]
camino-tempfile.workspace = true

View File

@@ -3,7 +3,6 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::env;
use std::io;
use std::num::NonZeroU32;
use std::pin::Pin;
use std::str::FromStr;
@@ -21,7 +20,6 @@ use azure_storage_blobs::blob::CopyStatus;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
use bytes::Bytes;
use futures::future::Either;
use futures::stream::Stream;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
@@ -130,12 +128,12 @@ impl AzureBlobStorage {
let kind = RequestKind::Get;
let _permit = self.permit(kind, cancel).await?;
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let mut etag = None;
let mut last_modified = None;
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
let download = async {
let response = builder
@@ -154,46 +152,39 @@ impl AzureBlobStorage {
Err(_elapsed) => Err(DownloadError::Timeout),
});
let mut response = Box::pin(response);
let mut response = std::pin::pin!(response);
let Some(part) = response.next().await else {
let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part?;
if etag.is_none() {
etag = Some(part.blob.properties.etag);
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part
.data
.collect()
.await
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
}
if bufs.is_empty() {
return Err(DownloadError::Other(anyhow::anyhow!(
"Azure GET response contained no response body"
"Azure GET response contained no buffers"
)));
};
let part = part?;
if etag.is_none() {
etag = Some(part.blob.properties.etag);
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
let etag = etag.unwrap();
let last_modified = last_modified.unwrap();
let tail_stream = response
.map(|part| match part {
Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
Err(e) => {
Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
}
})
.flatten();
let stream = part
.data
.map(|r| r.map_err(io::Error::other))
.chain(sync_wrapper::SyncStream::new(tail_stream));
//.chain(SyncStream::from_pin(Box::pin(tail_stream)));
let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
Ok(Download {
download_stream: Box::pin(download_stream),
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag,
last_modified,
metadata: Some(StorageMetadata(metadata)),
@@ -202,10 +193,7 @@ impl AzureBlobStorage {
tokio::select! {
bufs = download => bufs,
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
},
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}
}

View File

@@ -55,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// Set this limit analogously to the S3 limit
/// We set this a little bit low as we currently buffer the entire file into RAM
///
/// Here, a limit of max 20k concurrent connections was noted.
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
/// No limits on the client side, which currently means 1000 for AWS S3.
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
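
These limits only help if something enforces them; one conventional enforcement is a semaphore sized to the constant, held for the lifetime of each request. A sketch under that assumption (tokio-based, not the crate's actual permit machinery):

use std::sync::Arc;
use tokio::sync::Semaphore;

// Run `fut` while holding one of the limited permits, bounding concurrency.
async fn with_limit<T>(
    limiter: Arc<Semaphore>,
    fut: impl std::future::Future<Output = T>,
) -> T {
    let _permit = limiter.acquire().await.expect("semaphore closed");
    fut.await
}

// e.g. let limiter = Arc::new(Semaphore::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT));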

View File

@@ -50,14 +50,6 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
}
}
extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).update_donor(&mut (*donor), donor_lsn)
}
}
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
unsafe {
let callback_data = (*(*wp).config).callback_data;
@@ -399,7 +391,6 @@ pub(crate) fn create_api() -> walproposer_api {
get_shmem_state: Some(get_shmem_state),
start_streaming: Some(start_streaming),
get_flush_rec_ptr: Some(get_flush_rec_ptr),
update_donor: Some(update_donor),
get_current_timestamp: Some(get_current_timestamp),
conn_error_message: Some(conn_error_message),
conn_status: Some(conn_status),
@@ -430,32 +421,6 @@ pub(crate) fn create_api() -> walproposer_api {
}
}
pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
let empty_feedback = crate::bindings::PageserverFeedback {
present: false,
currentClusterSize: 0,
last_received_lsn: 0,
disk_consistent_lsn: 0,
remote_consistent_lsn: 0,
replytime: 0,
shard_number: 0,
};
crate::bindings::WalproposerShmemState {
propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
donor_name: [0; 64],
donor_conninfo: [0; 1024],
donor_lsn: 0,
mutex: 0,
mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
shard_ps_feedback: [empty_feedback; 128],
num_shards: 0,
min_ps_feedback: empty_feedback,
}
}
impl std::fmt::Display for Level {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self)

View File

@@ -1,5 +1,8 @@
use std::ffi::CString;
use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
api_bindings::{create_api, take_vec_u8, Level},
bindings::{
@@ -7,8 +10,6 @@ use crate::{
WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
},
};
use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::{id::TenantTimelineId, lsn::Lsn};
/// Rust high-level wrapper for C walproposer API. Many methods are not required
/// for simple cases, hence todo!() in default implementations.
@@ -27,10 +28,6 @@ pub trait ApiImpl {
todo!()
}
fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
todo!()
}
fn get_current_timestamp(&self) -> i64 {
todo!()
}
@@ -277,7 +274,6 @@ mod tests {
sync::{atomic::AtomicUsize, mpsc::sync_channel},
};
use std::cell::UnsafeCell;
use utils::id::TenantTimelineId;
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
@@ -301,8 +297,6 @@ mod tests {
replies_ptr: AtomicUsize,
// channel to send LSN to the main thread
sync_channel: std::sync::mpsc::SyncSender<u64>,
// Shmem state, used for storing donor info
shmem: UnsafeCell<crate::bindings::WalproposerShmemState>,
}
impl MockImpl {
@@ -333,22 +327,11 @@ mod tests {
}
impl ApiImpl for MockImpl {
fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
self.shmem.get()
}
fn get_current_timestamp(&self) -> i64 {
println!("get_current_timestamp");
0
}
fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) {
let mut shmem = unsafe { *self.get_shmem_state() };
shmem.propEpochStartLsn.value = donor_lsn;
shmem.donor_conninfo = donor.conninfo;
shmem.donor_lsn = donor_lsn;
}
fn conn_status(
&self,
_: &mut crate::bindings::Safekeeper,
@@ -524,7 +507,6 @@ mod tests {
],
replies_ptr: AtomicUsize::new(0),
sync_channel: sender,
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
});
let config = crate::walproposer::Config {
ttid,

View File

@@ -88,7 +88,6 @@ reqwest.workspace = true
rpds.workspace = true
enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
sha2.workspace = true
strum.workspace = true
strum_macros.workspace = true

View File

@@ -284,34 +284,6 @@ impl Client {
Ok((status, progress))
}
pub async fn tenant_secondary_status(
&self,
tenant_shard_id: TenantShardId,
) -> Result<SecondaryProgress> {
let path = reqwest::Url::parse(&format!(
"{}/v1/tenant/{}/secondary/status",
self.mgmt_api_endpoint, tenant_shard_id
))
.expect("Cannot build URL");
self.request(Method::GET, path, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
let path = reqwest::Url::parse(&format!(
"{}/v1/tenant/{}/heatmap_upload",
self.mgmt_api_endpoint, tenant_id
))
.expect("Cannot build URL");
self.request(Method::POST, path, ()).await?;
Ok(())
}
pub async fn location_config(
&self,
tenant_shard_id: TenantShardId,
@@ -319,7 +291,10 @@ impl Client {
flush_ms: Option<std::time::Duration>,
lazy: bool,
) -> Result<()> {
let req_body = TenantLocationConfigRequest { config };
let req_body = TenantLocationConfigRequest {
tenant_id: None,
config,
};
let mut path = reqwest::Url::parse(&format!(
"{}/v1/tenant/{}/location_config",

View File

@@ -1,16 +1,10 @@
use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use sha2::Digest;
use tracing::warn;
fn hash256(data: &[u8]) -> [u8; 32] {
sha2::Sha256::digest(data).into()
}
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b sha256].
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key = [0; METADATA_KEY_SIZE];
let hash = hash256(data);
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
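
Spelled out, the key layout is: one prefix byte, two directory bytes, then the first 13 bytes of the big-endian 128-bit hash. A sketch of the new xxh3 variant, assuming METADATA_KEY_SIZE == 16 (consistent with 3 + 13 bytes):

fn aux_key_bytes(prefix: u8, dir1: u8, dir2: u8, path: &[u8]) -> [u8; 16] {
    let hash = twox_hash::xxh3::hash128(path).to_be_bytes();
    let mut key = [0u8; 16];
    key[0] = prefix;
    key[1] = dir1;
    key[2] = dir2;
    // Truncate the 128-bit hash to the remaining 13 bytes of the key.
    key[3..].copy_from_slice(&hash[..13]);
    key
}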
@@ -67,84 +61,6 @@ pub fn encode_aux_file_key(path: &str) -> Key {
}
}
const AUX_FILE_ENCODING_VERSION: u8 = 0x01;
pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
let mut ptr = val;
if ptr.is_empty() {
// empty value = no files
return Ok(Vec::new());
}
assert_eq!(
ptr.get_u8(),
AUX_FILE_ENCODING_VERSION,
"unsupported aux file value"
);
let mut files = vec![];
while ptr.has_remaining() {
let key_len = ptr.get_u32() as usize;
let key = &ptr[..key_len];
ptr.advance(key_len);
let val_len = ptr.get_u32() as usize;
let content = &ptr[..val_len];
ptr.advance(val_len);
let path = std::str::from_utf8(key)?;
files.push((path, content));
}
Ok(files)
}
/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference
/// to the original value slice. Be cautious about memory consumption.
pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
let mut ptr = val.clone();
if ptr.is_empty() {
// empty value = no files
return Ok(Vec::new());
}
assert_eq!(
ptr.get_u8(),
AUX_FILE_ENCODING_VERSION,
"unsupported aux file value"
);
let mut files = vec![];
while ptr.has_remaining() {
let key_len = ptr.get_u32() as usize;
let key = ptr.slice(..key_len);
ptr.advance(key_len);
let val_len = ptr.get_u32() as usize;
let content = ptr.slice(..val_len);
ptr.advance(val_len);
let path = std::str::from_utf8(&key)?.to_string();
files.push((path, content));
}
Ok(files)
}
pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
if files.is_empty() {
// no files = empty value
return Ok(Vec::new());
}
let mut encoded = vec![];
encoded.put_u8(AUX_FILE_ENCODING_VERSION);
for (path, content) in files {
if path.len() > u32::MAX as usize {
anyhow::bail!("{} exceeds path size limit", path);
}
encoded.put_u32(path.len() as u32);
encoded.put_slice(path.as_bytes());
if content.len() > u32::MAX as usize {
anyhow::bail!("{} exceeds content size limit", path);
}
encoded.put_u32(content.len() as u32);
encoded.put_slice(content);
}
Ok(encoded)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -154,17 +70,14 @@ mod tests {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
"1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014",
hex::encode(hash256("test1".as_bytes()))
305317690835051308206966631765527126151,
twox_hash::xxh3::hash128("test1".as_bytes())
);
assert_eq!(
"3dfc364fd121aaa081cec54ef9ef6f69a4756df50cf3c52f1abd2b451829c4c0",
hex::encode(hash256("test/test2".as_bytes()))
);
assert_eq!(
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
hex::encode(hash256("".as_bytes()))
85104974691013376326742244813280798847,
twox_hash::xxh3::hash128("test/test2".as_bytes())
);
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
}
#[test]
@@ -172,45 +85,28 @@ mod tests {
// To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
// of the page server.
assert_eq!(
"62000001011B4F0E9851971998E732078544",
"6200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"620000010260303AE22B998861BCE3B28F33",
"620000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"6200000103E3B0C44298FC1C149AFBF4C899",
"620000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"62000001FF60945D49535300BE8E42108658",
"62000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"6200000201FD61A03AF4F77D870FC21E05E7",
"6200000201772D0E5D71DE14DA86142A1619",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"620000FFFF7F75AD39B73CD2A1FE41680550",
"620000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
);
}
#[test]
fn test_value_encoding() {
let files = vec![
("pg_logical/1.file", "1111".as_bytes()),
("pg_logical/2.file", "2222".as_bytes()),
];
assert_eq!(
files,
decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
);
let files = vec![];
assert_eq!(
files,
decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
);
}
}

View File

@@ -3,7 +3,6 @@
//! Main entry point for the Page Server executable.
use std::env::{var, VarError};
use std::io::Read;
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, str::FromStr};
@@ -152,34 +151,37 @@ fn initialize_config(
workdir: &Utf8Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.get_flag("init");
let update_config = init || arg_matches.get_flag("update-config");
let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
Ok(mut f) => {
if init {
anyhow::bail!("config file already exists: {cfg_file_path}");
}
let md = f.metadata().context("stat config file")?;
if md.is_file() {
let mut s = String::new();
f.read_to_string(&mut s).context("read config file")?;
Some(s.parse().context("parse config file toml")?)
} else {
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
}
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
Err(e) => {
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
if init {
anyhow::bail!(
"Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
);
}
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
.with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
(
cfg_file_contents
.parse::<toml_edit::Document>()
.with_context(|| {
format!("Failed to parse '{cfg_file_path}' as pageserver config")
})?,
true,
)
} else if cfg_file_path.exists() {
anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
} else {
// We're initializing the tenant, so there's no config file yet
(
DEFAULT_CONFIG_FILE
.parse::<toml_edit::Document>()
.context("could not parse built-in config file")?,
false,
)
};
let mut effective_config = file_contents.unwrap_or_else(|| {
DEFAULT_CONFIG_FILE
.parse()
.expect("unit tests ensure this works")
});
// Patch with overrides from the command line
if let Some(values) = arg_matches.get_many::<String>("config-override") {
for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
@@ -187,21 +189,22 @@ fn initialize_config(
})?;
for (key, item) in doc.iter() {
effective_config.insert(key, item.clone());
if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
}
toml.insert(key, item.clone());
}
}
}
debug!("Resulting toml: {effective_config}");
// Construct the runtime representation
let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
debug!("Resulting toml: {toml}");
let conf = PageServerConf::parse_and_validate(&toml, workdir)
.context("Failed to parse pageserver configuration")?;
if init {
if update_config {
info!("Writing pageserver config to '{cfg_file_path}'");
std::fs::write(cfg_file_path, effective_config.to_string())
std::fs::write(cfg_file_path, toml.to_string())
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
info!("Config successfully written to '{cfg_file_path}'")
}
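
In both the old and new versions, the override loop reduces to a small helper; a sketch using the same toml_edit types (a hypothetical helper matching the behavior shown, assuming the toml_edit and anyhow crates):

use std::str::FromStr;

// Each `-c` value is parsed as its own tiny TOML document; its top-level
// keys overwrite (or extend) the effective config.
fn apply_override(
    effective: &mut toml_edit::Document,
    option_line: &str,
) -> anyhow::Result<()> {
    let doc = toml_edit::Document::from_str(option_line)?;
    for (key, item) in doc.iter() {
        effective.insert(key, item.clone());
    }
    Ok(())
}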
@@ -755,13 +758,18 @@ fn cli() -> Command {
// See `settings.md` for more details on the extra configuration parameters the pageserver can process
.arg(
Arg::new("config-override")
.long("config-override")
.short('c')
.num_args(1)
.action(ArgAction::Append)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(
Arg::new("update-config")
.long("update-config")
.action(ArgAction::SetTrue)
.help("Update the config file when started"),
)
.arg(
Arg::new("enabled-features")
.long("enabled-features")

View File

@@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId;
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
use std::env;
use std::{collections::HashMap, env};
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
@@ -51,7 +51,7 @@ pub mod defaults {
use crate::tenant::config::defaults::*;
use const_format::formatcp;
pub use pageserver_api::config::{
pub use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT,
};
@@ -335,6 +335,26 @@ impl<T: Clone> BuilderValue<T> {
}
}
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
// as a separate structure. This information is not needed by the pageserver
// itself; it is only used for registering the pageserver with the control
// plane and/or storage controller.
//
#[derive(serde::Deserialize)]
pub(crate) struct NodeMetadata {
#[serde(rename = "host")]
pub(crate) postgres_host: String,
#[serde(rename = "port")]
pub(crate) postgres_port: u16,
pub(crate) http_host: String,
pub(crate) http_port: u16,
// Deployment tools may write fields to the metadata file beyond what we
// use in this type: this type intentionally only names the fields it requires.
#[serde(flatten)]
pub(crate) other: HashMap<String, serde_json::Value>,
}
// needed to simplify config construction
#[derive(Default)]
struct PageServerConfigBuilder {

View File

@@ -14,8 +14,10 @@ use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
use pageserver_api::config::NodeMetadata;
use crate::{
config::{NodeMetadata, PageServerConf},
virtual_file::on_fatal_io_error,
};
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -63,7 +65,7 @@ impl ControlPlaneClient {
let mut client = reqwest::ClientBuilder::new();
if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = reqwest::header::HeaderMap::new();
let mut headers = hyper::HeaderMap::new();
headers.insert(
"Authorization",
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),

View File

@@ -540,12 +540,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
js.spawn(async move {
layer
.secondary_tenant
.evict_layer(
tenant_manager.get_conf(),
layer.timeline_id,
layer.name,
layer.metadata,
)
.evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
.await;
Ok(file_size)
});

View File

@@ -782,6 +782,9 @@ components:
required:
- mode
properties:
tenant_id:
type: string
description: Not used, scheduled for removal.
mode:
type: string
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]

View File

@@ -63,7 +63,6 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::SpawnMode;
@@ -1229,15 +1228,13 @@ async fn layer_download_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let layer_name = LayerFileName::from_str(layer_file_name)
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let downloaded = timeline
.download_layer(&layer_name)
.download_layer(layer_file_name)
.await
.map_err(ApiError::InternalServerError)?;
@@ -1261,14 +1258,11 @@ async fn evict_timeline_layer_handler(
let layer_file_name = get_request_param(&request, "layer_file_name")?;
let state = get_state(&request);
let layer_name = LayerFileName::from_str(layer_file_name)
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let evicted = timeline
.evict_layer(&layer_name)
.evict_layer(layer_file_name)
.await
.map_err(ApiError::InternalServerError)?;
@@ -1833,75 +1827,6 @@ async fn timeline_download_remote_layers_handler_get(
json_response(StatusCode::OK, info)
}
async fn timeline_detach_ancestor_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::timeline::detach_ancestor::Options;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
async move {
let mut options = Options::default();
let rewrite_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
let copy_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?;
[
(&mut options.rewrite_concurrency, rewrite_concurrency),
(&mut options.copy_concurrency, copy_concurrency),
]
.into_iter()
.filter_map(|(target, val)| val.map(|val| (target, val)))
.for_each(|(target, val)| *target = val);
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
let (_guard, prepared) = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let res = state
.tenant_manager
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
.await;
match res {
Ok(reparented_timelines) => {
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
reparented_timelines,
};
json_response(StatusCode::OK, resp)
}
Err(e) => Err(ApiError::InternalServerError(
e.context("timeline detach completion"),
)),
}
}
.instrument(span)
.await
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
@@ -2235,27 +2160,6 @@ async fn secondary_download_handler(
json_response(status, progress)
}
async fn secondary_status_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let Some(secondary_tenant) = state
.tenant_manager
.get_secondary_tenant_shard(tenant_shard_id)
else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
));
};
let progress = secondary_tenant.progress.lock().unwrap().clone();
json_response(StatusCode::OK, progress)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -2590,10 +2494,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_get),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor",
|r| api_handler(r, timeline_detach_ancestor_handler),
)
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_delete_handler)
})
@@ -2621,9 +2521,6 @@ pub fn make_router(
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.get("/v1/tenant/:tenant_shard_id/secondary/status", |r| {
api_handler(r, secondary_status_handler)
})
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
api_handler(r, secondary_download_handler)
})

View File

@@ -51,8 +51,8 @@ pub(crate) enum StorageTimeOperation {
#[strum(serialize = "gc")]
Gc,
#[strum(serialize = "find gc cutoffs")]
FindGcCutoffs,
#[strum(serialize = "update gc info")]
UpdateGcInfo,
#[strum(serialize = "create tenant")]
CreateTenant,
@@ -194,11 +194,6 @@ pub(crate) struct GetVectoredLatency {
map: EnumMap<TaskKind, Option<Histogram>>,
}
#[allow(dead_code)]
pub(crate) struct ScanLatency {
map: EnumMap<TaskKind, Option<Histogram>>,
}
impl GetVectoredLatency {
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
// cardinality of the metric.
@@ -209,48 +204,6 @@ impl GetVectoredLatency {
}
}
impl ScanLatency {
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
// cardinality of the metric.
const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler];
pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
self.map[task_kind].as_ref()
}
}
pub(crate) struct ScanLatencyOngoingRecording<'a> {
parent: &'a Histogram,
start: std::time::Instant,
}
impl<'a> ScanLatencyOngoingRecording<'a> {
pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> {
let start = Instant::now();
ScanLatencyOngoingRecording { parent, start }
}
pub(crate) fn observe(self, throttled: Option<Duration>) {
let elapsed = self.start.elapsed();
let ex_throttled = if let Some(throttled) = throttled {
elapsed.checked_sub(throttled)
} else {
Some(elapsed)
};
if let Some(ex_throttled) = ex_throttled {
self.parent.observe(ex_throttled.as_secs_f64());
} else {
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut rate_limit = LOGGED.lock().unwrap();
rate_limit.call(|| {
warn!("error deducting time spent throttled; this message is logged at a global rate limit");
});
}
}
}
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_get_vectored_seconds",
@@ -274,29 +227,6 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
}
});
pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_scan_seconds",
"Time spent in scan, excluding time spent in timeline_get_throttle.",
&["task_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ScanLatency {
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
let task_kind = task_kind.into();
Some(inner.with_label_values(&[task_kind]))
} else {
None
}
})),
}
});
pub(crate) struct PageCacheMetricsForTaskKind {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_immutable: IntCounter,
@@ -1512,80 +1442,29 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
});
pub(crate) struct TenantManagerMetrics {
tenant_slots_attached: UIntGauge,
tenant_slots_secondary: UIntGauge,
tenant_slots_inprogress: UIntGauge,
pub(crate) tenant_slots: UIntGauge,
pub(crate) tenant_slot_writes: IntCounter,
pub(crate) unexpected_errors: IntCounter,
}
impl TenantManagerMetrics {
/// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
/// exactly: they track the lifetime of the slots _in the tenant map_.
pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
match slot {
TenantSlot::Attached(_) => {
self.tenant_slots_attached.inc();
}
TenantSlot::Secondary(_) => {
self.tenant_slots_secondary.inc();
}
TenantSlot::InProgress(_) => {
self.tenant_slots_inprogress.inc();
}
}
}
pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
match slot {
TenantSlot::Attached(_) => {
self.tenant_slots_attached.dec();
}
TenantSlot::Secondary(_) => {
self.tenant_slots_secondary.dec();
}
TenantSlot::InProgress(_) => {
self.tenant_slots_inprogress.dec();
}
}
}
#[cfg(all(debug_assertions, not(test)))]
pub(crate) fn slots_total(&self) -> u64 {
self.tenant_slots_attached.get()
+ self.tenant_slots_secondary.get()
+ self.tenant_slots_inprogress.get()
}
}
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
let tenant_slots = register_uint_gauge_vec!(
TenantManagerMetrics {
tenant_slots: register_uint_gauge!(
"pageserver_tenant_manager_slots",
"How many slots currently exist, including all attached, secondary and in-progress operations",
&["mode"]
)
.expect("failed to define a metric");
TenantManagerMetrics {
tenant_slots_attached: tenant_slots
.get_metric_with_label_values(&["attached"])
.unwrap(),
tenant_slots_secondary: tenant_slots
.get_metric_with_label_values(&["secondary"])
.unwrap(),
tenant_slots_inprogress: tenant_slots
.get_metric_with_label_values(&["inprogress"])
.unwrap(),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
.expect("failed to define a metric"),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
});
pub(crate) struct DeletionQueueMetrics {
@@ -2110,7 +1989,7 @@ pub(crate) struct TimelineMetrics {
pub imitate_logical_size_histo: StorageTimeMetrics,
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub update_gc_info_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2171,8 +2050,8 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let find_gc_cutoffs_histo = StorageTimeMetrics::new(
StorageTimeOperation::FindGcCutoffs,
let update_gc_info_histo = StorageTimeMetrics::new(
StorageTimeOperation::UpdateGcInfo,
&tenant_id,
&shard_id,
&timeline_id,
@@ -2219,7 +2098,7 @@ impl TimelineMetrics {
logical_size_histo,
imitate_logical_size_histo,
garbage_collect_histo,
find_gc_cutoffs_histo,
update_gc_info_histo,
load_layer_map_histo,
last_record_gauge,
resident_physical_size_gauge,
@@ -2326,7 +2205,6 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
/// Maintain a per timeline gauge in addition to the global gauge.
struct PerTimelineRemotePhysicalSizeGauge {
@@ -2929,8 +2807,6 @@ pub fn preinitialize_metrics() {
&WALRECEIVER_CANDIDATES_REMOVED,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
&REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
]
.into_iter()
.for_each(|c| {

View File

@@ -10,9 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::repository::*;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
@@ -24,7 +24,6 @@ use pageserver_api::key::{
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -280,7 +279,7 @@ impl Timeline {
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
Ok(exists)
}
Err(e) => Err(PageReconstructError::from(e)),
@@ -380,7 +379,7 @@ impl Timeline {
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.segments.contains(&segno);
let exists = dir.segments.get(&segno).is_some();
Ok(exists)
}
Err(e) => Err(PageReconstructError::from(e)),
@@ -671,7 +670,7 @@ impl Timeline {
self.get(CHECKPOINT_KEY, lsn, ctx).await
}
async fn list_aux_files_v1(
pub(crate) async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -689,63 +688,6 @@ impl Timeline {
}
}
async fn list_aux_files_v2(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let kv = self
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
.await
.context("scan")?;
let mut result = HashMap::new();
for (_, v) in kv {
let v = v.context("get value")?;
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
for (fname, content) in v {
result.insert(fname, content);
}
}
Ok(result)
}
pub(crate) async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
match self.get_switch_aux_file_policy() {
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
AuxFilePolicy::CrossValidation => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
match (v1_result, v2_result) {
(Ok(v1), Ok(v2)) => {
if v1 != v2 {
tracing::error!(
"unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
);
return Err(PageReconstructError::Other(anyhow::anyhow!(
"unmatched aux file v1 v2 result"
)));
}
Ok(v1)
}
(Ok(_), Err(v2)) => {
tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
Err(v2)
}
(Err(v1), Ok(_)) => {
tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
Err(v1)
}
(Err(_), Err(v2)) => Err(v2),
}
}
}
}
/// Does the same as get_current_logical_size but counted on demand.
/// Used to initialize the logical size tracking on startup.
///
@@ -1201,22 +1143,21 @@ impl<'a> DatadirModification<'a> {
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.pending_directory_entries
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
self.put(DBDIR_KEY, Value::Image(buf.into()));
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
// Didn't exist. Update dbdir
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.pending_directory_entries
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
self.put(DBDIR_KEY, Value::Image(buf.into()));
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
@@ -1447,9 +1388,6 @@ impl<'a> DatadirModification<'a> {
}
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
return Ok(());
}
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
@@ -1465,122 +1403,90 @@ impl<'a> DatadirModification<'a> {
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let policy = self.tline.get_switch_aux_file_policy();
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
Ok(val) => Some(val),
Err(PageReconstructError::MissingKey(_)) => None,
Err(e) => return Err(e.into()),
};
let files = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
let new_files = if content.is_empty() {
files
.into_iter()
.filter(|(p, _)| &path != p)
.collect::<Vec<_>>()
} else {
files
.into_iter()
.filter(|(p, _)| &path != p)
.chain(std::iter::once((path, content)))
.collect::<Vec<_>>()
};
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
}
let file_path = path.to_string();
let content = if content.is_empty() {
None
} else {
Some(Bytes::copy_from_slice(content))
};
if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
let file_path = path.to_string();
let content = if content.is_empty() {
None
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
Some(Bytes::copy_from_slice(content))
};
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
n_files = 1;
aux_files.dir = Some(dir);
}
n_files = 1;
aux_files.dir = Some(dir);
}
}
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
}
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
Ok(())
}
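
Underneath the branching, the write policy is simple: append deltas until MAX_AUX_FILE_DELTAS of them accumulate, then write a full directory image and reset the counter. A distilled sketch (the threshold value is assumed for illustration):

const MAX_AUX_FILE_DELTAS: usize = 1024;

enum AuxWrite {
    Delta, // append a NeonWalRecord::AuxFile delta
    Image, // write a full AuxFilesDirectory image and reset the counter
}

fn choose_aux_write(n_deltas: &mut usize) -> AuxWrite {
    if *n_deltas == MAX_AUX_FILE_DELTAS {
        *n_deltas = 0;
        AuxWrite::Image
    } else {
        *n_deltas += 1;
        AuxWrite::Delta
    }
}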

View File

@@ -33,6 +33,7 @@ impl Value {
}
}
#[cfg(test)]
#[derive(Debug, PartialEq)]
pub(crate) enum InvalidInput {
TooShortValue,
@@ -41,8 +42,10 @@ pub(crate) enum InvalidInput {
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
/// use this type for querying if a slice looks some particular way.
#[cfg(test)]
pub(crate) struct ValueBytes;
#[cfg(test)]
impl ValueBytes {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {

View File

@@ -319,9 +319,6 @@ pub enum TaskKind {
// Eviction. One per timeline.
Eviction,
// Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
IngestHousekeeping,
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
@@ -366,12 +363,8 @@ pub enum TaskKind {
EphemeralFilePreWarmPageCache,
LayerDownload,
#[cfg(test)]
UnitTest,
DetachAncestor,
}
#[derive(Default)]

View File

@@ -64,7 +64,6 @@ use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use self::timeline::WaitLsnError;
use self::timeline::{GcCutoffs, GcInfo};
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
@@ -87,6 +86,7 @@ use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::InitializationOrder;
use std::cmp::min;
use std::collections::hash_map::Entry;
use std::collections::BTreeSet;
use std::collections::HashMap;
@@ -322,9 +322,6 @@ pub struct Tenant {
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
}
impl std::fmt::Debug for Tenant {
@@ -1679,34 +1676,6 @@ impl Tenant {
Ok(())
}
// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
async fn ingest_housekeeping(&self) {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines = {
self.timelines
.lock()
.unwrap()
.values()
.filter_map(|timeline| {
if timeline.is_active() {
Some(timeline.clone())
} else {
None
}
})
.collect::<Vec<_>>()
};
for timeline in &timelines {
timeline.maybe_freeze_ephemeral_layer().await;
}
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -2560,7 +2529,6 @@ impl Tenant {
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
}
}
@@ -2844,48 +2812,7 @@ impl Tenant {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
// currently visible timelines.
let timelines = self
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| match target_timeline_id.as_ref() {
Some(target) => &tl.timeline_id == target,
None => true,
})
.cloned()
.collect::<Vec<_>>();
let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
HashMap::with_capacity(timelines.len());
for timeline in timelines.iter() {
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));
let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
match res {
Ok(cutoffs) => {
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
}
Err(e) => {
tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
}
}
}
if !self.is_active() {
anyhow::bail!("shutting down");
}
// grab mutex to prevent new timelines from being created here; avoid doing long operations
// because that will stall branch creation.
// grab mutex to prevent new timelines from being created here.
let gc_cs = self.gc_cs.lock().await;
// Scan all timelines. For each timeline, remember the timeline ID and
@@ -2947,6 +2874,11 @@ impl Tenant {
}
}
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
@@ -2954,27 +2886,9 @@ impl Tenant {
))
.map(|&x| x.1)
.collect();
{
let mut target = timeline.gc_info.write().unwrap();
match gc_cutoffs.remove(&timeline_id) {
Some(cutoffs) => {
*target = GcInfo {
retain_lsns: branchpoints,
cutoffs,
};
}
None => {
// reasons for this being unavailable:
// - this timeline was created while we were finding cutoffs
// - the LSN-for-timestamp search repeatedly fails for this timeline
//
// in both cases, refreshing the branchpoints is correct.
target.retain_lsns = branchpoints;
}
};
}
timeline
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
.await?;
gc_timelines.push(timeline);
}
@@ -3063,7 +2977,7 @@ impl Tenant {
// and then the planned GC cutoff
{
let gc_info = src_timeline.gc_info.read().unwrap();
let cutoff = gc_info.min_cutoff();
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
if start_lsn < cutoff {
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
@@ -3758,7 +3672,7 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
}
}
}
@@ -3957,7 +3871,7 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::key::NON_INHERITED_RANGE;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::CompactionAlgorithm;
use rand::{thread_rng, Rng};
@@ -4599,20 +4513,18 @@ mod tests {
}
async fn bulk_insert_compact_gc(
tenant: &Tenant,
timeline: &Arc<Timeline>,
timeline: Arc<Timeline>,
ctx: &RequestContext,
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
let compact = true;
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
}
async fn bulk_insert_maybe_compact_gc(
tenant: &Tenant,
timeline: &Arc<Timeline>,
timeline: Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
repeat: usize,
@@ -4625,8 +4537,6 @@ mod tests {
// Enforce that the key range is monotonically increasing
let mut keyspace = KeySpaceAccum::new();
let cancel = CancellationToken::new();
for _ in 0..repeat {
for _ in 0..key_count {
test_key.field6 = blknum;
@@ -4648,19 +4558,24 @@ mod tests {
blknum += 1;
}
let cutoff = timeline.get_last_record_lsn();
timeline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
ctx,
)
.await?;
timeline.freeze_and_flush().await?;
if compact {
// this requires timeline to be &Arc<Timeline>
timeline.compact(&cancel, EnumSet::empty(), ctx).await?;
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
}
// this doesn't really need to use the timeline_id target, but it is closer
// to what the original code did.
let res = tenant
.gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
.await?;
assert_eq!(res.layers_removed, 0, "this never removes anything");
timeline.gc().await?;
}
Ok(())
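The "requires timeline to be &Arc<Timeline>" comment above likely refers to compact being declared with an Arc receiver, the same self: &Arc<Self> shape that appears on RemoteTimelineClient methods later in this diff. A runnable sketch of that receiver pattern, under that assumption:

use std::sync::Arc;

struct Timeline;

impl Timeline {
    // An `&Arc<Self>` receiver lets the method hand an owned clone of the
    // Arc to background work it spawns, without consuming the caller's Arc.
    fn compact(self: &Arc<Self>) {
        let this = Arc::clone(self);
        std::thread::spawn(move || {
            drop(this); // the background task owns its own reference
        })
        .join()
        .unwrap();
    }
}

fn main() {
    let tl = Arc::new(Timeline);
    tl.compact(); // auto-ref turns Arc<Timeline> into &Arc<Timeline>
}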
@@ -4679,7 +4594,7 @@ mod tests {
.await?;
let lsn = Lsn(0x10);
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
Ok(())
}
@@ -4710,7 +4625,7 @@ mod tests {
.await?;
let lsn = Lsn(0x10);
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
guard.layer_map().dump(true, &ctx).await?;
@@ -4823,7 +4738,15 @@ mod tests {
.await;
let images = vectored_res?;
assert!(images.is_empty());
let mut key = NON_INHERITED_RANGE.start;
while key < NON_INHERITED_RANGE.end {
assert!(matches!(
images[&key],
Err(PageReconstructError::MissingKey(_))
));
key = key.next();
}
Ok(())
}
@@ -5156,7 +5079,6 @@ mod tests {
.await?;
const NUM_KEYS: usize = 1000;
let cancel = CancellationToken::new();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5216,10 +5138,18 @@ mod tests {
}
// Perform a cycle of flush, and GC
tline.freeze_and_flush().await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
tline.freeze_and_flush().await?;
tline.gc().await?;
}
Ok(())
@@ -5240,8 +5170,6 @@ mod tests {
let mut keyspace = KeySpaceAccum::new();
let cancel = CancellationToken::new();
// Track when each page was last modified. Used to assert that
// a read sees the latest page version.
let mut updated = [Lsn(0); NUM_KEYS];
@@ -5305,11 +5233,21 @@ mod tests {
}
// Perform a cycle of flush, compact, and GC
tline.freeze_and_flush().await?;
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.gc().await?;
}
Ok(())
@@ -5514,7 +5452,7 @@ mod tests {
let lsn = Lsn(0x10);
let compact = false;
bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?;
bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let read_lsn = Lsn(u64::MAX - 1);
@@ -5524,108 +5462,4 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_metadata_scan() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_scan")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 1000;
const STEP: usize = 100; // random update + scan base_key + idx * STEP
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let mut test_key = base_key;
// Track when each page was last modified. Used to assert that
// a read sees the latest page version.
let mut updated = [Lsn(0); NUM_KEYS];
let mut lsn = Lsn(0x10);
#[allow(clippy::needless_range_loop)]
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
updated[blknum] = lsn;
drop(writer);
}
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
for _ in 0..10 {
// Read all the blocks
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = (blknum * STEP) as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
);
}
let mut cnt = 0;
for (key, value) in tline
.get_vectored_impl(
keyspace.clone(),
lsn,
ValuesReconstructState::default(),
&ctx,
)
.await?
{
let blknum = key.field6 as usize;
let value = value?;
assert!(blknum % STEP == 0);
let blknum = blknum / STEP;
assert_eq!(
value,
test_img(&format!("{} at {}", blknum, updated[blknum]))
);
cnt += 1;
}
assert_eq!(cnt, NUM_KEYS);
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
}
// Perform a cycle of flush, compact, and GC
tline.freeze_and_flush().await?;
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
Ok(())
}
}

View File

@@ -130,9 +130,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
src_buf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<(), Error>) {
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
let (src_buf, res) = self.inner.write_all(src_buf).await;
let nbytes = match res {
Ok(nbytes) => nbytes,
Err(e) => return (src_buf, Err(e)),
@@ -143,9 +142,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
#[inline(always)]
/// Flushes the internal buffer to the underlying `VirtualFile`.
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
let buf = std::mem::take(&mut self.buf);
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
let (mut buf, res) = self.inner.write_all(buf).await;
res?;
buf.clear();
self.buf = buf;
@@ -166,11 +165,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
src_buf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<(), Error>) {
if !BUFFERED {
assert!(self.buf.is_empty());
return self.write_all_unbuffered(src_buf, ctx).await;
return self.write_all_unbuffered(src_buf).await;
}
let remaining = Self::CAPACITY - self.buf.len();
let src_buf_len = src_buf.bytes_init();
@@ -185,7 +183,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
}
// Then, if the buffer is full, flush it out
if self.buf.len() == Self::CAPACITY {
if let Err(e) = self.flush_buffer(ctx).await {
if let Err(e) = self.flush_buffer().await {
return (Slice::into_inner(src_buf), Err(e));
}
}
@@ -201,7 +199,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
assert_eq!(copied, src_buf.len());
Slice::into_inner(src_buf)
} else {
let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
if let Err(e) = res {
return (src_buf, Err(e));
}
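The surrounding hunks implement a standard buffered-writer policy: append to the in-memory buffer while the data fits, flush once the buffer reaches CAPACITY, and send oversized writes through the unbuffered path. The exact threshold test falls between hunks here, so the following is a hedged sketch of the routing decision only; the `<=` comparison is an assumption:

const CAPACITY: usize = 8192; // stand-in for Self::CAPACITY

#[derive(Debug, PartialEq)]
enum Route {
    Buffer, // copy into the in-memory buffer, flushing first if it is full
    Direct, // too large to benefit from buffering: take the unbuffered path
}

// Routing decision only; the real code also flushes when buf.len() == CAPACITY.
fn route(buffered: usize, incoming: usize) -> Route {
    let remaining = CAPACITY - buffered;
    if incoming <= remaining {
        Route::Buffer
    } else {
        Route::Direct
    }
}

fn main() {
    assert_eq!(route(0, 100), Route::Buffer);
    assert_eq!(route(8000, 100_000), Route::Direct);
}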
@@ -218,7 +216,6 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
srcbuf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
let offset = self.offset;
@@ -230,7 +227,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
if len < 128 {
// Short blob. Write a 1-byte length header
io_buf.put_u8(len as u8);
self.write_all(io_buf, ctx).await
self.write_all(io_buf).await
} else {
// Write a 4-byte length header
if len > 0x7fff_ffff {
@@ -245,7 +242,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);
self.write_all(io_buf, ctx).await
self.write_all(io_buf).await
}
}
.await;
@@ -254,7 +251,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
Ok(_) => (),
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
}
let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
let (srcbuf, res) = self.write_all(srcbuf).await;
(srcbuf, res.map(|_| offset))
}
}
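The length header written by write_blob is compact enough to show in isolation: lengths under 128 take one byte, and anything larger (up to 0x7fff_ffff) takes a 4-byte big-endian length with the top bit set as a marker. A standalone sketch of the encoding:

// Encode the blob length header used above: 1 byte for short blobs,
// 4 bytes with the high bit set for long ones.
fn encode_len_header(len: usize) -> Vec<u8> {
    if len < 128 {
        vec![len as u8]
    } else {
        assert!(len <= 0x7fff_ffff, "blob too large");
        let mut buf = (len as u32).to_be_bytes();
        buf[0] |= 0x80;
        buf.to_vec()
    }
}

fn main() {
    assert_eq!(encode_len_header(5), [0x05]);
    assert_eq!(encode_len_header(300), [0x80, 0x00, 0x01, 0x2c]);
}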
@@ -264,8 +261,8 @@ impl BlobWriter<true> {
///
/// This function flushes the internal buffer before giving access
/// to the underlying `VirtualFile`.
pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<VirtualFile, Error> {
self.flush_buffer(ctx).await?;
pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
self.flush_buffer().await?;
Ok(self.inner)
}
@@ -302,16 +299,16 @@ mod tests {
let file = VirtualFile::create(pathbuf.as_path()).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
let (_, res) = wtr.write_blob(blob.clone()).await;
let offs = res?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
let offs = res?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer(&ctx).await?;
wtr.flush_buffer().await?;
}
let file = VirtualFile::open(pathbuf.as_path()).await?;

View File

@@ -9,7 +9,6 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
@@ -371,9 +370,9 @@ pub struct TenantConf {
// Expressed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
/// Switch to a new aux file policy. Switching this flag requires that the user has not written any aux file into
/// Switch to aux file v2. Switching this flag requires that the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruption.
pub switch_aux_file_policy: AuxFilePolicy,
pub switch_to_aux_file_v2: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -472,7 +471,7 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,
pub switch_to_aux_file_v2: Option<bool>,
}
impl TenantConfOpt {
@@ -530,9 +529,9 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
switch_to_aux_file_v2: self
.switch_to_aux_file_v2
.unwrap_or(global_conf.switch_to_aux_file_v2),
}
}
}
@@ -574,7 +573,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::V1,
switch_to_aux_file_v2: false,
}
}
}
@@ -649,7 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
switch_to_aux_file_v2: value.switch_to_aux_file_v2,
}
}
}
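TenantConfOpt resolves each field independently: a Some value overrides the global TenantConf default, a None falls through to it. A reduced sketch of that overlay pattern, using the two fields touched by this hunk:

#[derive(Clone, Copy)]
struct GlobalConf {
    image_layer_creation_check_threshold: u8,
    switch_to_aux_file_v2: bool,
}

#[derive(Default)]
struct ConfOpt {
    image_layer_creation_check_threshold: Option<u8>,
    switch_to_aux_file_v2: Option<bool>,
}

impl ConfOpt {
    // Each field independently overrides the global default.
    fn merge(&self, global: GlobalConf) -> GlobalConf {
        GlobalConf {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global.image_layer_creation_check_threshold),
            switch_to_aux_file_v2: self
                .switch_to_aux_file_v2
                .unwrap_or(global.switch_to_aux_file_v2),
        }
    }
}

fn main() {
    let global = GlobalConf {
        image_layer_creation_check_threshold: 2,
        switch_to_aux_file_v2: false,
    };
    let overrides = ConfOpt { switch_to_aux_file_v2: Some(true), ..Default::default() };
    let effective = overrides.merge(global);
    assert!(effective.switch_to_aux_file_v2);
    assert_eq!(effective.image_layer_creation_check_threshold, 2);
}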

View File

@@ -585,20 +585,9 @@ impl DeleteTenantFlow {
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
// Update stats
match &removed {
TenantsMapRemoveResult::Occupied(slot) => {
crate::metrics::TENANT_MANAGER.slot_removed(slot);
}
TenantsMapRemoveResult::InProgress(barrier) => {
crate::metrics::TENANT_MANAGER
.slot_removed(&TenantSlot::InProgress(barrier.clone()));
}
TenantsMapRemoveResult::Vacant => {
// Nothing changed in map, no metric update
}
}
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
match removed {
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {

View File

@@ -74,7 +74,7 @@ impl EphemeralFile {
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
_ctx: &RequestContext,
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
@@ -83,15 +83,15 @@ impl EphemeralFile {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
self.rw.write_all_borrowed(&len_buf, ctx).await?;
self.rw.write_all_borrowed(&len_buf).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
self.rw.write_all_borrowed(&len_buf).await?;
}
// Write the payload
self.rw.write_all_borrowed(srcbuf, ctx).await?;
self.rw.write_all_borrowed(srcbuf).await?;
Ok(pos)
}

View File

@@ -35,14 +35,10 @@ impl RW {
self.page_cache_file_id
}
pub(crate) async fn write_all_borrowed(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<usize, io::Error> {
pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
// because Compute is unlikely to access recently written data.
self.rw.write_all_borrowed(srcbuf, ctx).await
self.rw.write_all_borrowed(srcbuf).await
}
pub(crate) fn bytes_written(&self) -> u64 {
@@ -138,7 +134,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let buf = buf.slice(..);
let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
@@ -155,7 +150,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
);
// Do the IO.
let iobuf = match self.file.write_all(buf, ctx).await {
let iobuf = match self.file.write_all(buf).await {
(iobuf, Ok(nwritten)) => {
assert_eq!(nwritten, buflen);
iobuf

View File

@@ -20,7 +20,6 @@
mod zero_padded;
use crate::{
context::RequestContext,
page_cache::PAGE_SZ,
virtual_file::owned_buffers_io::{
self,
@@ -61,12 +60,8 @@ where
self.buffered_writer.as_inner().as_inner()
}
pub async fn write_all_borrowed(
&mut self,
buf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf).await
}
pub fn bytes_written(&self) -> u64 {

View File

@@ -588,7 +588,7 @@ impl LayerMap {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
coverage.push((kr, current_val.take()));
current_key = change_key;
current_val.clone_from(&change_val);
current_val = change_val.clone();
}
// Add the final interval
@@ -672,12 +672,12 @@ impl LayerMap {
// Loop through the delta coverage and recurse on each part
for (change_key, change_val) in version.delta_coverage.range(start..end) {
// If there's a relevant delta in this part, add 1 and recurse down
if let Some(val) = &current_val {
if let Some(val) = current_val {
if val.get_lsn_range().end > lsn.start {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(val, key) as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
max_stacked_deltas = std::cmp::max(
@@ -689,17 +689,17 @@ impl LayerMap {
}
current_key = change_key;
current_val.clone_from(&change_val);
current_val = change_val.clone();
}
// Consider the last part
if let Some(val) = &current_val {
if let Some(val) = current_val {
if val.get_lsn_range().end > lsn.start {
let kr = Key::from_i128(current_key)..Key::from_i128(end);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(val, key) as usize;
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
max_stacked_deltas = std::cmp::max(

View File

@@ -207,24 +207,6 @@ impl TimelineMetadata {
self.body.ancestor_lsn
}
/// When reparenting, the `ancestor_lsn` does not change.
pub fn reparent(&mut self, timeline: &TimelineId) {
assert!(self.body.ancestor_timeline.is_some());
// no assertion against redoing this: it's fine, we may have to repeat it multiple times
self.body.ancestor_timeline = Some(*timeline);
}
pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, *timeline);
}
if self.body.ancestor_lsn != Lsn(0) {
assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
}
self.body.ancestor_timeline = None;
self.body.ancestor_lsn = Lsn(0);
}
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
self.body.latest_gc_cutoff_lsn
}

View File

@@ -56,7 +56,6 @@ use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
@@ -247,7 +246,6 @@ impl TenantsMap {
}
}
#[cfg(all(debug_assertions, not(test)))]
pub(crate) fn len(&self) -> usize {
match self {
TenantsMap::Initializing => 0,
@@ -748,7 +746,6 @@ pub async fn init_tenant_mgr(
}
};
METRICS.slot_inserted(&slot);
tenants.insert(tenant_shard_id, slot);
}
@@ -756,7 +753,7 @@ pub async fn init_tenant_mgr(
let mut tenants_map = TENANTS.write().unwrap();
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
METRICS.tenant_slots.set(tenants.len() as u64);
*tenants_map = TenantsMap::Open(tenants);
Ok(TenantManager {
@@ -827,14 +824,6 @@ fn tenant_spawn(
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();
#[cfg(all(debug_assertions, not(test)))]
{
// Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check,
// as it happens implicitly at the end of tests etc.
let m = tenants.read().unwrap();
debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
}
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
let (total_in_progress, total_attached) = {
let mut m = tenants.write().unwrap();
@@ -2008,101 +1997,6 @@ impl TenantManager {
})
.collect())
}
/// Completes an earlier prepared timeline detach ancestor.
pub(crate) async fn complete_detaching_timeline_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
struct RevertOnDropSlot(Option<SlotGuard>);
impl Drop for RevertOnDropSlot {
fn drop(&mut self) {
if let Some(taken) = self.0.take() {
taken.revert();
}
}
}
impl RevertOnDropSlot {
fn into_inner(mut self) -> SlotGuard {
self.0.take().unwrap()
}
}
impl std::ops::Deref for RevertOnDropSlot {
type Target = SlotGuard;
fn deref(&self) -> &Self::Target {
self.0.as_ref().unwrap()
}
}
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
let slot_guard = RevertOnDropSlot(Some(slot_guard));
let tenant = {
let Some(old_slot) = slot_guard.get_old_value() else {
anyhow::bail!(
"Tenant not found when trying to complete detaching timeline ancestor"
);
};
let Some(tenant) = old_slot.get_attached() else {
anyhow::bail!("Tenant is not in attached state");
};
if !tenant.is_active() {
anyhow::bail!("Tenant is not active");
}
tenant.clone()
};
let timeline = tenant.get_timeline(timeline_id, true)?;
let reparented = timeline
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
.await?;
let mut slot_guard = slot_guard.into_inner();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already in progress?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
ctx,
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
Ok(reparented)
}
}
#[derive(Debug, thiserror::Error)]
@@ -2534,13 +2428,10 @@ impl SlotGuard {
TenantsMap::Open(m) => m,
};
METRICS.slot_inserted(&new_value);
let replaced = m.insert(self.tenant_shard_id, new_value);
self.upserted = true;
if let Some(replaced) = replaced.as_ref() {
METRICS.slot_removed(replaced);
}
METRICS.tenant_slots.set(m.len() as u64);
replaced
};
@@ -2650,13 +2541,9 @@ impl Drop for SlotGuard {
}
if self.old_value_is_shutdown() {
METRICS.slot_removed(entry.get());
entry.remove();
} else {
let inserting = self.old_value.take().unwrap();
METRICS.slot_inserted(&inserting);
let replaced = entry.insert(inserting);
METRICS.slot_removed(&replaced);
entry.insert(self.old_value.take().unwrap());
}
}
Entry::Vacant(_) => {
@@ -2667,6 +2554,8 @@ impl Drop for SlotGuard {
);
}
}
METRICS.tenant_slots.set(m.len() as u64);
}
}
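The slot_inserted/slot_removed calls removed in these hunks all serve one invariant, which the deleted debug assertion above states directly: the metric's slot count must equal the map's length. A hypothetical helper showing one way to keep the two in lockstep by updating the gauge at every map mutation:

use std::collections::HashMap;

struct Metrics {
    slots: i64,
}

impl Metrics {
    fn slot_inserted(&mut self) { self.slots += 1; }
    fn slot_removed(&mut self) { self.slots -= 1; }
}

// Hypothetical helper: every insert into the map updates the gauge in the
// same place, so the metric can never drift from map.len().
fn insert_slot<V>(map: &mut HashMap<u32, V>, metrics: &mut Metrics, key: u32, value: V) {
    if let Some(_replaced) = map.insert(key, value) {
        metrics.slot_removed(); // the replaced slot leaves the map
    }
    metrics.slot_inserted();
    debug_assert_eq!(metrics.slots as usize, map.len());
}

fn main() {
    let mut map = HashMap::new();
    let mut metrics = Metrics { slots: 0 };
    insert_slot(&mut map, &mut metrics, 1, "a");
    insert_slot(&mut map, &mut metrics, 1, "b"); // replacement: count stays 1
    assert_eq!(metrics.slots, 1);
}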
@@ -2746,9 +2635,7 @@ fn tenant_map_acquire_slot_impl(
}
_ => {
let (completion, barrier) = utils::completion::channel();
let inserting = TenantSlot::InProgress(barrier);
METRICS.slot_inserted(&inserting);
v.insert(inserting);
v.insert(TenantSlot::InProgress(barrier));
tracing::debug!("Vacant, inserted InProgress");
Ok(SlotGuard::new(*tenant_shard_id, None, completion))
}
@@ -2784,10 +2671,7 @@ fn tenant_map_acquire_slot_impl(
_ => {
// Happy case: the slot was not in any state that violated our mode
let (completion, barrier) = utils::completion::channel();
let in_progress = TenantSlot::InProgress(barrier);
METRICS.slot_inserted(&in_progress);
let old_value = o.insert(in_progress);
METRICS.slot_removed(&old_value);
let old_value = o.insert(TenantSlot::InProgress(barrier));
tracing::debug!("Occupied, replaced with InProgress");
Ok(SlotGuard::new(
*tenant_shard_id,

View File

@@ -210,7 +210,6 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
@@ -506,7 +505,6 @@ impl RemoteTimelineClient {
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -524,7 +522,6 @@ impl RemoteTimelineClient {
layer_file_name,
layer_metadata,
cancel,
ctx,
)
.measure_remote_op(
RemoteOpFileKind::Layer,
@@ -570,7 +567,7 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
Ok(())
}
@@ -591,7 +588,7 @@ impl RemoteTimelineClient {
upload_queue.latest_metadata.apply(update);
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
Ok(())
}
@@ -611,14 +608,18 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
}
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
@@ -627,7 +628,11 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let index_part = IndexPart::from(&*upload_queue);
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -637,61 +642,9 @@ impl RemoteTimelineClient {
self.launch_queued_tasks(upload_queue);
}
pub(crate) async fn schedule_reparenting_and_wait(
self: &Arc<Self>,
new_parent: &TimelineId,
) -> anyhow::Result<()> {
// FIXME: because Timeline::schedule_uploads, when called from layer flushing,
// reads the in-memory state, we cannot do the detaching like this
let receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.latest_metadata.reparent(new_parent);
self.schedule_index_upload(upload_queue);
self.schedule_barrier0(upload_queue)
};
Self::wait_completion0(receiver).await
}
/// Schedules uploading a new version of `index_part.json` with the given layers added,
/// detaching from ancestor and waits for it to complete.
///
/// This is used with `Timeline::detach_ancestor` functionality.
pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait(
self: &Arc<Self>,
layers: &[Layer],
adopted: (TimelineId, Lsn),
) -> anyhow::Result<()> {
let barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue
.latest_metadata
.detach_from_ancestor(&adopted.0, &adopted.1);
for layer in layers {
upload_queue
.latest_files
.insert(layer.layer_desc().filename(), layer.metadata());
}
self.schedule_index_upload(upload_queue);
let barrier = self.schedule_barrier0(upload_queue);
self.launch_queued_tasks(upload_queue);
barrier
};
Self::wait_completion0(barrier).await
}
/// Launch an upload operation in the background; the file is added to be included in the next
/// `index_part.json` upload.
/// Launch an upload operation in the background.
///
pub(crate) fn schedule_layer_file_upload(
self: &Arc<Self>,
layer: ResidentLayer,
@@ -717,11 +670,9 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!(
gen=?metadata.generation,
shard=?metadata.shard,
"scheduled layer file upload {layer}",
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);
let op = UploadOp::UploadLayer(layer, metadata);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -784,6 +735,10 @@ impl RemoteTimelineClient {
where
I: IntoIterator<Item = LayerFileName>,
{
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need to update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
// is later used when physically deleting layers, to construct key paths.
@@ -822,7 +777,7 @@ impl RemoteTimelineClient {
// index_part update, because that needs to be uploaded before we can actually delete the
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, metadata);
}
with_metadata
@@ -924,18 +879,12 @@ impl RemoteTimelineClient {
/// Wait for all previously scheduled uploads/deletions to complete
pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
let receiver = {
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue)
};
Self::wait_completion0(receiver).await
}
async fn wait_completion0(
mut receiver: tokio::sync::watch::Receiver<()>,
) -> anyhow::Result<()> {
if receiver.changed().await.is_err() {
anyhow::bail!("wait_completion aborted because upload queue was stopped");
}
@@ -1051,7 +1000,8 @@ impl RemoteTimelineClient {
let deleted_at = Utc::now().naive_utc();
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
.context("IndexPart serialize")?;
index_part.deleted_at = Some(deleted_at);
index_part
};
@@ -1132,93 +1082,6 @@ impl RemoteTimelineClient {
Ok(())
}
/// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload.
///
/// This is not normally needed.
pub(crate) async fn upload_layer_file(
self: &Arc<Self>,
uploaded: &ResidentLayer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&uploaded.layer_desc().filename(),
uploaded.metadata().generation,
);
backoff::retry(
|| async {
upload::upload_timeline_layer(
&self.storage_impl,
uploaded.local_path(),
&remote_path,
uploaded.metadata().file_size(),
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"upload a layer without adding it to latest files",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("upload a layer without adding it to latest files")
}
/// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is
/// not added to be part of a future `index_part.json` upload.
pub(crate) async fn copy_timeline_layer(
self: &Arc<Self>,
adopted: &Layer,
adopted_as: &Layer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let source_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&adopted
.get_timeline_id()
.expect("Source timeline should be alive"),
self.tenant_shard_id.to_index(),
&adopted.layer_desc().filename(),
adopted.metadata().generation,
);
let target_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&adopted_as.layer_desc().filename(),
adopted_as.metadata().generation,
);
backoff::retry(
|| async {
upload::copy_timeline_layer(
&self.storage_impl,
&source_remote_path,
&target_remote_path,
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"copy timeline layer",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remote copy timeline layer")
}
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
@@ -1390,7 +1253,7 @@ impl RemoteTimelineClient {
while let Some(next_op) = upload_queue.queued_operations.front() {
// Can we run this task now?
let can_run_now = match next_op {
UploadOp::UploadLayer(..) => {
UploadOp::UploadLayer(_, _) => {
// Can always be scheduled.
true
}
@@ -1517,25 +1380,13 @@ impl RemoteTimelineClient {
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let local_path = layer.local_path();
// We should only be uploading layers created by this `Tenant`'s lifetime, so
// the metadata in the upload should always match our current generation.
assert_eq!(layer_metadata.generation, self.generation);
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
layer_metadata.shard,
&layer.layer_desc().filename(),
layer_metadata.generation,
);
let path = layer.local_path();
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
local_path,
&remote_path,
layer_metadata.file_size(),
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
@@ -1964,6 +1815,29 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}
}
/// Files in remote storage are stored at paths relative to the workdir.
/// That path itself includes both tenant and timeline ids, which makes the remote storage path unique.
///
/// Errors if the provided path does not start with the pageserver's workdir.
pub fn remote_path(
conf: &PageServerConf,
local_path: &Utf8Path,
generation: Generation,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;
let suffixed = format!("{0}{1}", stripped, generation.get_suffix());
RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
format!(
"to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
)
})
}
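Concretely, remote_path strips the workdir prefix and appends a generation suffix, so a layer file under the workdir maps to a workdir-relative remote key. An illustration under the assumption that the suffix looks like a fixed-width hex tag (Generation::get_suffix is defined elsewhere):

// Illustration only: mirrors the strip-prefix-plus-suffix computation above.
fn remote_key(workdir: &str, local_path: &str, gen_suffix: &str) -> Option<String> {
    let stripped = local_path.strip_prefix(workdir)?.trim_start_matches('/');
    Some(format!("{stripped}{gen_suffix}"))
}

fn main() {
    let key = remote_key(
        "/data/pageserver",
        "/data/pageserver/tenants/t1/timelines/tl1/layerfile",
        "-00000008", // hypothetical generation suffix
    );
    assert_eq!(
        key.as_deref(),
        Some("tenants/t1/timelines/tl1/layerfile-00000008")
    );
}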
#[cfg(test)]
mod tests {
use super::*;
@@ -1971,7 +1845,6 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::layer::local_layer_path,
Tenant, Timeline,
},
DEFAULT_PG_VERSION,
@@ -2154,20 +2027,11 @@ mod tests {
]
.into_iter()
.map(|(name, contents): (LayerFileName, Vec<u8>)| {
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&name,
&generation,
);
std::fs::write(&local_path, &contents).unwrap();
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
Layer::for_resident(
harness.conf,
&timeline,
local_path,
name,
LayerFileMetadata::new(contents.len() as u64, generation, shard),
)
@@ -2304,22 +2168,19 @@ mod tests {
..
} = TestSetup::new("metrics").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&layer_file_name_1,
&harness.generation,
);
let content_1 = dummy_contents("foo");
std::fs::write(&local_path, &content_1).unwrap();
std::fs::write(
timeline_path.join(layer_file_name_1.file_name()),
&content_1,
)
.unwrap();
let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
local_path,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
);
@@ -2388,7 +2249,12 @@ mod tests {
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_index_part = IndexPart::example();
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
);
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

View File

@@ -18,10 +18,8 @@ use tracing::warn;
use utils::backoff;
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -42,7 +40,6 @@ use super::{
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
///
/// Returns the size of the downloaded file.
#[allow(clippy::too_many_arguments)]
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
@@ -51,18 +48,11 @@ pub async fn download_layer_file<'a>(
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = local_layer_path(
conf,
&tenant_shard_id,
&timeline_id,
layer_file_name,
&layer_metadata.generation,
);
let local_path = timeline_path.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,
@@ -85,7 +75,7 @@ pub async fn download_layer_file<'a>(
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let bytes_amount = download_retry(
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
&format!("download {remote_path:?}"),
cancel,
)
@@ -143,7 +133,6 @@ async fn download_object<'a>(
src_path: &RemotePath,
dst_path: &Utf8PathBuf,
cancel: &CancellationToken,
#[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
) -> Result<u64, DownloadError> {
let res = match crate::virtual_file::io_engine::get() {
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
@@ -219,10 +208,10 @@ async fn download_object<'a>(
Err(e) => return Err(e),
};
buffered
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx)
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
.await?;
}
let size_tracking = buffered.flush_and_into_inner(ctx).await?;
let size_tracking = buffered.flush_and_into_inner().await?;
Ok(size_tracking.into_inner())
}
.await?;

View File

@@ -6,6 +6,7 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::bin_ser::SerializeError;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerFileName;
@@ -103,14 +104,15 @@ impl IndexPart {
pub const FILE_NAME: &'static str = "index_part.json";
fn new(
layers_and_metadata: &HashMap<LayerFileName, LayerFileMetadata>,
pub fn new(
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
) -> Self {
// Transform LayerFileMetadata into IndexLayerMetadata
let layer_metadata = layers_and_metadata
.iter()
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
.into_iter()
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
.collect();
Self {
@@ -139,24 +141,20 @@ impl IndexPart {
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
#[cfg(test)]
pub(crate) fn example() -> Self {
let example_metadata = TimelineMetadata::example();
Self::new(
&HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
)
}
}
impl From<&UploadQueueInitialized> for IndexPart {
fn from(uq: &UploadQueueInitialized) -> Self {
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
let metadata = uq.latest_metadata.clone();
impl TryFrom<&UploadQueueInitialized> for IndexPart {
type Error = SerializeError;
Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let metadata = upload_queue.latest_metadata.clone();
Ok(Self::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
))
}
}
@@ -174,8 +172,8 @@ pub struct IndexLayerMetadata {
pub shard: ShardIndex,
}
impl From<&LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &LayerFileMetadata) -> Self {
impl From<LayerFileMetadata> for IndexLayerMetadata {
fn from(other: LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,

View File

@@ -12,13 +12,18 @@ use tokio_util::sync::CancellationToken;
use utils::backoff;
use super::Generation;
use crate::tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path,
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_path,
},
};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
use remote_storage::{GenericRemoteStorage, TimeTravelError};
use utils::id::{TenantId, TimelineId};
use super::index::LayerFileMetadata;
use tracing::info;
/// Serializes and uploads the given index part data to the remote storage.
@@ -60,10 +65,11 @@ pub(crate) async fn upload_index_part<'a>(
///
/// On an error, bumps the retries count and reschedules the entire task.
pub(super) async fn upload_timeline_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_path: &'a Utf8Path,
remote_path: &'a RemotePath,
metadata_size: u64,
source_path: &'a Utf8Path,
known_metadata: &'a LayerFileMetadata,
generation: Generation,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
@@ -72,7 +78,8 @@ pub(super) async fn upload_timeline_layer<'a>(
pausable_failpoint!("before-upload-layer-pausable");
let source_file_res = fs::File::open(&local_path).await;
let storage_path = remote_path(conf, source_path, generation)?;
let source_file_res = fs::File::open(&source_path).await;
let source_file = match source_file_res {
Ok(source_file) => source_file,
Err(e) if e.kind() == ErrorKind::NotFound => {
@@ -83,49 +90,34 @@ pub(super) async fn upload_timeline_layer<'a>(
// it has been written to disk yet.
//
// This is tested against `test_compaction_delete_before_upload`
info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}
Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?,
Err(e) => {
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
}
};
let fs_size = source_file
.metadata()
.await
.with_context(|| format!("get the source file metadata for layer {local_path:?}"))?
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
.len();
let metadata_size = known_metadata.file_size();
if metadata_size != fs_size {
bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
}
let fs_size = usize::try_from(fs_size)
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
storage
.upload(reader, fs_size, remote_path, None, cancel)
.upload(reader, fs_size, &storage_path, None, cancel)
.await
.with_context(|| format!("upload layer from local path '{local_path}'"))
}
pub(super) async fn copy_timeline_layer(
storage: &GenericRemoteStorage,
source_path: &RemotePath,
target_path: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-copy-layer", |_| {
bail!("failpoint before-copy-layer")
});
pausable_failpoint!("before-copy-layer-pausable");
storage
.copy_object(source_path, target_path, cancel)
.await
.with_context(|| format!("copy layer {source_path} to {target_path}"))
.with_context(|| format!("upload layer from local path '{source_path}'"))
}
/// Uploads the given `initdb` data to the remote storage.

View File

@@ -7,7 +7,6 @@ use std::{sync::Arc, time::SystemTime};
use crate::{
config::PageServerConf,
context::RequestContext,
disk_usage_eviction_task::DiskUsageEvictionInfo,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
virtual_file::MaybeFatalIo,
@@ -21,9 +20,8 @@ use self::{
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerFileName},
storage_layer::LayerFileName,
};
use pageserver_api::{
@@ -183,7 +181,6 @@ impl SecondaryTenant {
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
metadata: LayerFileMetadata,
) {
debug_assert_current_span_has_tenant_id();
@@ -197,13 +194,9 @@ impl SecondaryTenant {
let now = SystemTime::now();
let local_path = local_layer_path(
conf,
&self.tenant_shard_id,
&timeline_id,
&name,
&metadata.generation,
);
let path = conf
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
let this = self.clone();
@@ -214,7 +207,7 @@ impl SecondaryTenant {
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(local_path);
let deleted = std::fs::remove_file(path);
let not_found = deleted
.as_ref()
@@ -323,13 +316,9 @@ pub fn spawn_tasks(
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
let downloader_task_ctx = RequestContext::new(
TaskKind::SecondaryDownloads,
crate::context::DownloadBehavior::Download,
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
downloader_task_ctx.task_kind(),
TaskKind::SecondaryDownloads,
None,
None,
"secondary tenant downloads",
@@ -341,7 +330,6 @@ pub fn spawn_tasks(
download_req_rx,
bg_jobs_clone,
cancel_clone,
downloader_task_ctx,
)
.await;

View File

@@ -8,7 +8,6 @@ use std::{
use crate::{
config::PageServerConf,
context::RequestContext,
disk_usage_eviction_task::{
finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
},
@@ -22,7 +21,7 @@ use crate::{
FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerFileName},
storage_layer::LayerFileName,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
@@ -31,10 +30,7 @@ use crate::{
use super::{
heatmap::HeatMapLayer,
scheduler::{
self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult,
TenantBackgroundJobs,
},
scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
SecondaryTenant,
};
@@ -48,6 +44,7 @@ use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
@@ -77,14 +74,12 @@ pub(super) async fn downloader_task(
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
root_ctx: RequestContext,
) {
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
root_ctx,
};
let mut scheduler = Scheduler::new(generator, concurrency);
@@ -97,7 +92,6 @@ pub(super) async fn downloader_task(
struct SecondaryDownloader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
root_ctx: RequestContext,
}
#[derive(Debug, Clone)]
@@ -276,7 +270,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
// take priority to run again.
let mut detail = secondary_state.detail.lock().unwrap();
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
}
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -307,9 +301,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
}
if detail.next_download.is_none() {
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
// Initialize with a jitter: this spreads initial downloads on startup
// or mass-attach across our freshen interval.
let jittered_period =
rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
detail.next_download = Some(now.checked_add(jittered_period).expect(
"Using our constant, which is known to be small compared with clock range",
));
}
@@ -371,12 +367,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
let remote_storage = self.remote_storage.clone();
let conf = self.tenant_manager.get_conf();
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
let download_ctx = self.root_ctx.attached_child();
(RunningDownload { barrier }, Box::pin(async move {
let _completion = completion;
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
.download(&download_ctx)
.download()
.await
{
Err(UpdateError::NoData) => {
@@ -490,7 +485,7 @@ impl<'a> TenantDownloader<'a> {
}
}
async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> {
async fn download(&self) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_id();
// For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
@@ -565,7 +560,7 @@ impl<'a> TenantDownloader<'a> {
}
let timeline_id = timeline.timeline_id;
self.download_timeline(timeline, ctx)
self.download_timeline(timeline)
.instrument(tracing::info_span!(
"secondary_download_timeline",
tenant_id=%tenant_shard_id.tenant_id,
@@ -621,12 +616,12 @@ impl<'a> TenantDownloader<'a> {
let layers_in_heatmap = heatmap_timeline
.layers
.iter()
.map(|l| (&l.name, l.metadata.generation))
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| (l.0, l.1.metadata.generation))
.map(|l| l.0)
.collect::<HashSet<_>>();
let mut layer_count = layers_on_disk.len();
@@ -637,24 +632,16 @@ impl<'a> TenantDownloader<'a> {
.sum();
// Remove on-disk layers that are no longer present in heatmap
for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) {
for layer in layers_on_disk.difference(&layers_in_heatmap) {
layer_count -= 1;
layer_byte_count -= timeline_state
.on_disk_layers
.get(layer_file_name)
.get(layer)
.unwrap()
.metadata
.file_size();
let local_path = local_layer_path(
self.conf,
self.secondary_state.get_tenant_shard_id(),
timeline_id,
layer_file_name,
generation,
);
delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path));
delete_layers.push((*timeline_id, (*layer).clone()));
}
progress.bytes_downloaded += layer_byte_count;
@@ -669,7 +656,11 @@ impl<'a> TenantDownloader<'a> {
}
// Execute accumulated deletions
for (timeline_id, layer_name, local_path) in delete_layers {
for (timeline_id, layer_name) in delete_layers {
let timeline_path = self
.conf
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
let local_path = timeline_path.join(layer_name.to_string());
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
@@ -751,13 +742,12 @@ impl<'a> TenantDownloader<'a> {
.and_then(|x| x)
}
async fn download_timeline(
&self,
timeline: HeatMapTimeline,
ctx: &RequestContext,
) -> Result<(), UpdateError> {
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id);
// Accumulate updates to the state
let mut touched = Vec::new();
@@ -807,14 +797,10 @@ impl<'a> TenantDownloader<'a> {
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
@@ -889,7 +875,6 @@ impl<'a> TenantDownloader<'a> {
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
ctx,
)
.await
{
@@ -908,13 +893,7 @@ impl<'a> TenantDownloader<'a> {
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
let local_path = timeline_path.join(layer.name.to_string());
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",

View File

@@ -20,14 +20,12 @@ use crate::{
use futures::Future;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
use super::{
heatmap::HeatMapTenant,
scheduler::{
self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult,
TenantBackgroundJobs,
},
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
CommandRequest, UploadCommand,
};
use tokio_util::sync::CancellationToken;
@@ -183,11 +181,15 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
last_digest: None,
.or_insert_with(|| {
let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
last_digest: None,
}
});
// Decline to do the upload if insufficient time has passed
@@ -272,7 +274,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
let next_upload = tenant
.get_heatmap_period()
.and_then(|period| now.checked_add(period_jitter(period, 5)));
.and_then(|period| now.checked_add(period));
WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),

View File

@@ -1,5 +1,4 @@
use futures::Future;
use rand::Rng;
use std::{
collections::HashMap,
marker::PhantomData,
@@ -20,26 +19,6 @@ use super::{CommandRequest, CommandResponse};
const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
/// Jitter a Duration by an integer percentage. Returned values are uniform
/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range)
pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration {
if d == Duration::ZERO {
d
} else {
rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
}
}
/// When a periodic task first starts, it should wait for some time in the range 0..period, so
/// that starting many such tasks at the same time spreads them across the time range.
pub(super) fn period_warmup(period: Duration) -> Duration {
if period == Duration::ZERO {
period
} else {
rand::thread_rng().gen_range(Duration::ZERO..period)
}
}
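For intuition, here is a minimal, self-contained sketch of how these two helpers are meant to combine in a periodic scheduler; the scheduler shape is hypothetical, only the jitter math mirrors the helpers above.

// Minimal sketch (hypothetical scheduler; jitter math mirrors the helpers above).
use rand::Rng;
use std::time::{Duration, Instant};

fn period_jitter(d: Duration, pct: u32) -> Duration {
    if d == Duration::ZERO {
        d
    } else {
        rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
    }
}

fn period_warmup(period: Duration) -> Duration {
    if period == Duration::ZERO {
        period
    } else {
        rand::thread_rng().gen_range(Duration::ZERO..period)
    }
}

fn main() {
    let period = Duration::from_secs(60);
    // First deadline: spread tasks across one whole period so they don't all fire at once.
    let first = Instant::now() + period_warmup(period);
    // Later deadlines: +/- 5% jitter so periodic tasks drift apart over time.
    let next = first + period_jitter(period, 5);
    println!("{first:?} {next:?}");
}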
/// Scheduling helper for background work across many tenants.
///
/// Systems that need to run background work across many tenants may use this type

View File

@@ -118,6 +118,9 @@ pub(super) async fn gather_inputs(
ctx: &RequestContext,
) -> anyhow::Result<ModelInputs> {
// a refresh is needed to update the GC-related pitr_cutoff and horizon_cutoff
//
// FIXME: if a single timeline is deleted while the gc info refresh is ongoing, we will fail the
// whole computation. That does not make sense from the billing perspective.
tenant
.refresh_gc_info(cancel, ctx)
.await
@@ -189,9 +192,7 @@ pub(super) async fn gather_inputs(
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// horizon_cutoff.
let pitr_cutoff = gc_info.cutoffs.pitr;
let horizon_cutoff = gc_info.cutoffs.horizon;
let mut next_gc_cutoff = pitr_cutoff;
let mut next_gc_cutoff = gc_info.pitr_cutoff;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
@@ -218,8 +219,6 @@ pub(super) async fn gather_inputs(
.map(|lsn| (lsn, LsnKind::BranchPoint))
.collect::<Vec<_>>();
drop(gc_info);
// Add branch points we collected earlier, just in case there were any that were
// not present in retain_lsns. We will remove any duplicates later.
if let Some(this_branchpoints) = branchpoints.get(&timeline_id) {
@@ -298,8 +297,8 @@ pub(super) async fn gather_inputs(
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
horizon_cutoff,
pitr_cutoff,
horizon_cutoff: gc_info.horizon_cutoff,
pitr_cutoff: gc_info.pitr_cutoff,
next_gc_cutoff,
retention_param_cutoff,
});

View File

@@ -428,15 +428,9 @@ impl DeltaLayerWriterInner {
///
/// The values must be appended in key, lsn order.
///
async fn put_value(
&mut self,
key: Key,
lsn: Lsn,
val: Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
let (_, res) = self
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx)
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init())
.await;
res
}
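The "key, lsn order" contract above can be captured by a tiny checker; a sketch with stand-in Key/Lsn types, not the writer's actual implementation:

// Sketch (stand-in types): enforce the append order that put_value requires.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Key(u128);
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

#[derive(Default)]
struct OrderChecker {
    last: Option<(Key, Lsn)>,
}

impl OrderChecker {
    fn observe(&mut self, key: Key, lsn: Lsn) {
        if let Some(last) = self.last {
            // Tuples compare lexicographically: key first, then lsn.
            assert!((key, lsn) >= last, "values must be appended in key, lsn order");
        }
        self.last = Some((key, lsn));
    }
}

fn main() {
    let mut c = OrderChecker::default();
    c.observe(Key(1), Lsn(10));
    c.observe(Key(1), Lsn(20)); // same key, increasing lsn: ok
    c.observe(Key(2), Lsn(5)); // larger key resets the lsn requirement: ok
}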
@@ -447,10 +441,9 @@ impl DeltaLayerWriterInner {
lsn: Lsn,
val: Vec<u8>,
will_init: bool,
ctx: &RequestContext,
) -> (Vec<u8>, anyhow::Result<()>) {
assert!(self.lsn_range.start <= lsn);
let (val, res) = self.blob_writer.write_blob(val, ctx).await;
let (val, res) = self.blob_writer.write_blob(val).await;
let off = match res {
Ok(off) => off,
Err(e) => return (val, Err(anyhow::anyhow!(e))),
@@ -470,23 +463,18 @@ impl DeltaLayerWriterInner {
///
/// Finish writing the delta layer.
///
async fn finish(
self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let mut file = self.blob_writer.into_inner(ctx).await?;
let mut file = self.blob_writer.into_inner().await?;
// Write out the index
let (index_root_blk, block_buf) = self.tree.finish()?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
.await?;
for buf in block_buf.blocks {
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res?;
}
assert!(self.lsn_range.start < self.lsn_range.end);
@@ -506,7 +494,7 @@ impl DeltaLayerWriterInner {
// TODO: could use smallvec here but it's a pain with Slice<T>
Summary::ser_into(&summary, &mut buf)?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res?;
let metadata = file
@@ -604,18 +592,8 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub async fn put_value(
&mut self,
key: Key,
lsn: Lsn,
val: Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.inner
.as_mut()
.unwrap()
.put_value(key, lsn, val, ctx)
.await
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
}
pub async fn put_value_bytes(
@@ -624,12 +602,11 @@ impl DeltaLayerWriter {
lsn: Lsn,
val: Vec<u8>,
will_init: bool,
ctx: &RequestContext,
) -> (Vec<u8>, anyhow::Result<()>) {
self.inner
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init, ctx)
.put_value_bytes(key, lsn, val, will_init)
.await
}
@@ -644,11 +621,10 @@ impl DeltaLayerWriter {
mut self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
let inner = self.inner.take().unwrap();
let temp_path = inner.path.clone();
let result = inner.finish(key_end, timeline, ctx).await;
let result = inner.finish(key_end, timeline).await;
// The delta layer files can sometimes be really large. Clean them up.
if result.is_err() {
tracing::warn!(
@@ -716,7 +692,7 @@ impl DeltaLayer {
// TODO: could use smallvec here, but it's a pain with Slice<T>
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res?;
Ok(())
}
@@ -1139,15 +1115,15 @@ impl DeltaLayerInner {
Ok(all_keys)
}
/// Using the given writer, write out a version containing only the Lsns earlier than `until`.
///
/// Returns the number of key-value records pushed to the writer.
/// Using the given writer, write out a truncated version, where LSNs higher than
/// `truncate_at` are missing.
#[cfg(test)]
pub(super) async fn copy_prefix(
&self,
writer: &mut DeltaLayerWriter,
until: Lsn,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
) -> anyhow::Result<()> {
use crate::tenant::vectored_blob_io::{
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
};
@@ -1211,8 +1187,6 @@ impl DeltaLayerInner {
// FIXME: buffering of DeltaLayerWriter
let mut per_blob_copy = Vec::new();
let mut records = 0;
while let Some(item) = stream.try_next().await? {
tracing::debug!(?item, "popped");
let offset = item
@@ -1231,7 +1205,7 @@ impl DeltaLayerInner {
prev = Option::from(item);
let actionable = actionable.filter(|x| x.0.lsn < until);
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
let builder = if let Some((meta, offsets)) = actionable {
// extend or create a new builder
@@ -1299,7 +1273,7 @@ impl DeltaLayerInner {
let will_init = crate::repository::ValueBytes::will_init(data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);
@@ -1307,19 +1281,10 @@ impl DeltaLayerInner {
per_blob_copy.extend_from_slice(data);
let (tmp, res) = writer
.put_value_bytes(
key,
lsn,
std::mem::take(&mut per_blob_copy),
will_init,
ctx,
)
.put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
.await;
per_blob_copy = tmp;
res?;
records += 1;
}
buffer = Some(res.buf);
@@ -1331,7 +1296,7 @@ impl DeltaLayerInner {
"with the sentinel above loop should had handled all"
);
Ok(records)
Ok(())
}
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -1404,6 +1369,7 @@ impl DeltaLayerInner {
Ok(())
}
#[cfg(test)]
fn stream_index_forwards<'a, R>(
&'a self,
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
@@ -1794,14 +1760,12 @@ mod test {
for entry in entries {
let (_, res) = writer
.put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx)
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
.await;
res?;
}
let resident = writer
.finish(entries_meta.key_range.end, &timeline, &ctx)
.await?;
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
let inner = resident.as_delta(&ctx).await?;
@@ -1987,7 +1951,7 @@ mod test {
.await
.unwrap();
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
copied_layer.as_delta(ctx).await.unwrap();

View File

@@ -2,13 +2,11 @@
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use crate::repository::Key;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::str::FromStr;
use regex::Regex;
use utils::lsn::Lsn;
use super::PersistentLayerDesc;
@@ -76,19 +74,10 @@ impl DeltaFileName {
let key_end_str = key_parts.next()?;
let lsn_start_str = lsn_parts.next()?;
let lsn_end_str = lsn_parts.next()?;
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
return None;
}
if key_start_str.len() != 36
|| key_end_str.len() != 36
|| lsn_start_str.len() != 16
|| lsn_end_str.len() != 16
{
return None;
}
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
@@ -193,10 +182,6 @@ impl ImageFileName {
return None;
}
if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
return None;
}
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
@@ -274,22 +259,9 @@ impl From<DeltaFileName> for LayerFileName {
impl FromStr for LayerFileName {
type Err = String;
/// Conversion from either a physical layer filename or the string-ization of
/// Self. When loading a physical layer filename, we drop any extra information
/// not needed to build Self.
fn from_str(value: &str) -> Result<Self, Self::Err> {
let gen_suffix_regex = Regex::new("^(?<base>.+)-(?<gen>[0-9a-f]{8})$").unwrap();
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
Some(captures) => captures
.name("base")
.expect("Non-optional group")
.as_str()
.into(),
None => value.into(),
};
let delta = DeltaFileName::parse_str(&file_name);
let image = ImageFileName::parse_str(&file_name);
let delta = DeltaFileName::parse_str(value);
let image = ImageFileName::parse_str(value);
let ok = match (delta, image) {
(None, None) => {
return Err(format!(
@@ -343,42 +315,3 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
v.parse().map_err(|e| E::custom(e))
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerFileName::Image(ImageFileName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
// Omitting generation suffix is valid
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
Ok(())
}
#[test]
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
// Omitting generation suffix is valid
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
Ok(())
}
}

View File

@@ -357,7 +357,7 @@ impl ImageLayer {
// TODO: could use smallvec here but it's a pain with Slice<T>
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res?;
Ok(())
}
@@ -677,14 +677,9 @@ impl ImageLayerWriterInner {
///
/// The page versions must be appended in blknum order.
///
async fn put_image(
&mut self,
key: Key,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
let (_img, res) = self.blob_writer.write_blob(img).await;
// TODO: re-use the buffer for `img` further upstack
let off = res?;
@@ -698,11 +693,7 @@ impl ImageLayerWriterInner {
///
/// Finish writing the image layer.
///
async fn finish(
self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -713,7 +704,7 @@ impl ImageLayerWriterInner {
.await?;
let (index_root_blk, block_buf) = self.tree.finish()?;
for buf in block_buf.blocks {
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res?;
}
@@ -733,7 +724,7 @@ impl ImageLayerWriterInner {
// TODO: could use smallvec here but it's a pain with Slice<T>
Summary::ser_into(&summary, &mut buf)?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res?;
let metadata = file
@@ -815,13 +806,8 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub async fn put_image(
&mut self,
key: Key,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img).await
}
///
@@ -830,9 +816,8 @@ impl ImageLayerWriter {
pub(crate) async fn finish(
mut self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline, ctx).await
self.inner.take().unwrap().finish(timeline).await
}
}

View File

@@ -659,14 +659,14 @@ impl InMemoryLayer {
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
.put_value_bytes(*key, *lsn, buf, will_init)
.await;
res?;
}
}
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
Ok(Some(delta_layer))
}
}

View File

@@ -4,21 +4,19 @@ use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::context::RequestContext;
use crate::repository::Key;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::task_mgr::TaskKind;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
@@ -124,25 +122,6 @@ impl PartialEq for Layer {
}
}
pub(crate) fn local_layer_path(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
_generation: &Generation,
) -> Utf8PathBuf {
let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
timeline_path.join(layer_file_name.file_name())
// TODO: include generation in the name in now+1 releases.
// timeline_path.join(format!(
// "{}{}",
// layer_file_name.file_name(),
// generation.get_suffix()
// ))
}
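To make the TODO concrete, a sketch (hypothetical values) of the current path shape next to the planned generation-suffixed one; the 8-hex-digit suffix shape matches the `-00000001` endings seen in the parser tests elsewhere in this diff:

// Sketch (hypothetical values): current vs. planned generation-suffixed layer path.
fn main() {
    let timeline_path = "tenants/t1/timelines/tl1"; // hypothetical
    let layer_file_name = "SOME_LAYER_FILE"; // hypothetical stand-in
    let generation_suffix = "-00000001"; // assumed Generation::get_suffix() shape

    let current = format!("{timeline_path}/{layer_file_name}");
    let planned = format!("{timeline_path}/{layer_file_name}{generation_suffix}");
    assert_ne!(current, planned);
    println!("{current}\n{planned}");
}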
impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
@@ -151,14 +130,6 @@ impl Layer {
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> Self {
let local_path = local_layer_path(
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&file_name,
&metadata.generation,
);
let desc = PersistentLayerDesc::from_filename(
timeline.tenant_shard_id,
timeline.timeline_id,
@@ -171,7 +142,6 @@ impl Layer {
let owner = Layer(Arc::new(LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
None,
@@ -188,7 +158,6 @@ impl Layer {
pub(crate) fn for_resident(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> ResidentLayer {
@@ -214,7 +183,6 @@ impl Layer {
LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
@@ -256,19 +224,9 @@ impl Layer {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
let local_path = local_layer_path(
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&desc.filename(),
&timeline.generation,
);
LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
@@ -451,13 +409,6 @@ impl Layer {
self.0.metadata()
}
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
self.0
.timeline
.upgrade()
.map(|timeline| timeline.timeline_id)
}
/// Traditional debug dumping facility
#[allow(unused)]
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -757,17 +708,19 @@ impl Drop for LayerInner {
}
impl LayerInner {
#[allow(clippy::too_many_arguments)]
fn new(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
access_stats: LayerAccessStats,
desc: PersistentLayerDesc,
downloaded: Option<Arc<DownloadedLayer>>,
generation: Generation,
shard: ShardIndex,
) -> Self {
let path = conf
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
.join(desc.filename().to_string());
let (inner, version, init_status) = if let Some(inner) = downloaded {
let version = inner.version;
let resident = ResidentOrWantedEvicted::Resident(inner);
@@ -783,7 +736,7 @@ impl LayerInner {
LayerInner {
conf,
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path: local_path,
path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
@@ -986,20 +939,11 @@ impl LayerInner {
return Err(DownloadError::DownloadRequired);
}
let download_ctx = ctx
.map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download))
.unwrap_or(RequestContext::new(
TaskKind::LayerDownload,
DownloadBehavior::Download,
));
async move {
tracing::info!(%reason, "downloading on-demand");
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
let res = self
.download_init_and_wait(timeline, permit, download_ctx)
.await?;
let res = self.download_init_and_wait(timeline, permit).await?;
scopeguard::ScopeGuard::into_inner(init_cancelled);
Ok(res)
}
@@ -1038,7 +982,6 @@ impl LayerInner {
self: &Arc<Self>,
timeline: Arc<Timeline>,
permit: heavier_once_cell::InitPermit,
ctx: RequestContext,
) -> Result<Arc<DownloadedLayer>, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -1068,7 +1011,7 @@ impl LayerInner {
.await
.unwrap();
let res = this.download_and_init(timeline, permit, &ctx).await;
let res = this.download_and_init(timeline, permit).await;
if let Err(res) = tx.send(res) {
match res {
@@ -1111,7 +1054,6 @@ impl LayerInner {
self: &Arc<LayerInner>,
timeline: Arc<Timeline>,
permit: heavier_once_cell::InitPermit,
ctx: &RequestContext,
) -> anyhow::Result<Arc<DownloadedLayer>> {
let client = timeline
.remote_client
@@ -1119,12 +1061,7 @@ impl LayerInner {
.expect("checked before download_init_and_wait");
let result = client
.download_layer_file(
&self.desc.filename(),
&self.metadata(),
&timeline.cancel,
ctx,
)
.download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel)
.await;
match result {
@@ -1843,23 +1780,25 @@ impl ResidentLayer {
}
}
/// Returns the number of keys and values written to the writer.
pub(crate) async fn copy_delta_prefix(
/// FIXME: truncate is a bad name because we are not truncating anything, only copying the
/// filtered parts.
#[cfg(test)]
pub(super) async fn copy_delta_prefix(
&self,
writer: &mut super::delta_layer::DeltaLayerWriter,
until: Lsn,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
) -> anyhow::Result<()> {
use LayerKind::*;
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => d
.copy_prefix(writer, until, ctx)
.copy_prefix(writer, truncate_at, ctx)
.await
.with_context(|| format!("copy_delta_prefix until {until} of {self}")),
Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
.with_context(|| format!("truncate {self}")),
Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")),
}
}

View File

@@ -2,7 +2,6 @@
//! such as compaction and GC
use std::ops::ControlFlow;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};
@@ -10,11 +9,9 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion};
@@ -47,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
Compaction,
Gc,
Eviction,
IngestHouseKeeping,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
@@ -136,30 +132,6 @@ pub fn start_background_loops(
}
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::IngestHousekeeping,
Some(tenant_shard_id),
None,
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
ingest_housekeeping_loop(tenant, cancel)
.instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
Ok(())
}
},
);
}
///
@@ -407,61 +379,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
loop {
tokio::select! {
_ = cancel.cancelled() => {
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
},
}
// We run ingest housekeeping with the same frequency as compaction: it is not worth
// having a distinct setting. But we don't run it in the same task, because compaction
// blocks on acquiring the background job semaphore.
let period = tenant.get_compaction_period();
// If compaction period is set to zero (to disable it), then we will use a reasonable default
let period = if period == Duration::ZERO {
humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
.unwrap()
.into()
} else {
period
};
// Jitter the period by +/- 5%
let period =
rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);
// Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
// a tenant, since it won't have started writing any ephemeral files yet.
if tokio::time::timeout(period, cancel.cancelled())
.await
.is_ok()
{
break;
}
let started_at = Instant::now();
tenant.ingest_housekeeping().await;
warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::IngestHouseKeeping,
);
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
// if the tenant has a proper status already, no need to wait for anything
if tenant.current_state() == TenantState::Active {
@@ -503,6 +420,8 @@ pub(crate) async fn random_init_delay(
period: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
use rand::Rng;
if period == Duration::ZERO {
return Ok(());
}

View File

@@ -1,6 +1,5 @@
mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
mod init;
pub mod layer_manager;
@@ -17,15 +16,11 @@ use enumset::EnumSet;
use fail::fail_point;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
},
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
models::{
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
TimelineState,
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -60,7 +55,7 @@ use std::{
ops::ControlFlow,
};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
@@ -82,9 +77,6 @@ use crate::{
use crate::{
disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
};
use crate::{
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
};
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
use crate::{
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
@@ -333,7 +325,7 @@ pub struct Timeline {
// List of child timelines and their branch points. This is needed to avoid
// garbage collecting data that is still needed by the child timelines.
pub(crate) gc_info: std::sync::RwLock<GcInfo>,
pub gc_info: std::sync::RwLock<GcInfo>,
// It may change across major versions so for simplicity
// keep it after running initdb for a timeline.
@@ -417,59 +409,33 @@ pub struct WalReceiverInfo {
pub last_received_msg_ts: u128,
}
///
/// Information about how much history needs to be retained, needed by
/// Garbage Collection.
#[derive(Default)]
pub(crate) struct GcInfo {
///
pub struct GcInfo {
/// Specific LSNs that are needed.
///
/// Currently, this includes all points where child branches have
/// been forked off from. In the future, could also include
/// explicit user-defined snapshot points.
pub(crate) retain_lsns: Vec<Lsn>,
pub retain_lsns: Vec<Lsn>,
/// The cutoff coordinates, which are combined by selecting the minimum.
pub(crate) cutoffs: GcCutoffs,
}
impl GcInfo {
pub(crate) fn min_cutoff(&self) -> Lsn {
self.cutoffs.select_min()
}
}
/// The `GcInfo` component describing which Lsns need to be retained.
#[derive(Debug)]
pub(crate) struct GcCutoffs {
/// Keep everything newer than this point.
/// In addition to 'retain_lsns', keep everything newer than this
/// point.
///
/// This is calculated by subtracting the 'gc_horizon' setting from
/// the last-record LSN.
///
/// FIXME: is this inclusive or exclusive?
pub(crate) horizon: Lsn,
pub horizon_cutoff: Lsn,
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
/// point.
///
/// This is calculated by finding a number such that a record is needed for PITR
/// if and only if its LSN is larger than 'pitr_cutoff'.
pub(crate) pitr: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
horizon: Lsn::INVALID,
pitr: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
std::cmp::min(self.horizon, self.pitr)
}
pub pitr_cutoff: Lsn,
}
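The removed select_min above makes the combination rule explicit; a stand-in sketch:

// Sketch (stand-in Lsn): GC retains everything at or newer than the smaller of
// the space-based horizon cutoff and the time-based PITR cutoff.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Lsn(u64);

fn min_cutoff(horizon_cutoff: Lsn, pitr_cutoff: Lsn) -> Lsn {
    std::cmp::min(horizon_cutoff, pitr_cutoff)
}

fn main() {
    // A long PITR window (small pitr_cutoff) wins over a short horizon.
    assert_eq!(min_cutoff(Lsn(0x80), Lsn(0x40)), Lsn(0x40));
}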
/// An error happened in a get() operation.
@@ -498,6 +464,7 @@ pub(crate) enum PageReconstructError {
#[derive(Debug)]
pub struct MissingKeyError {
stuck_at_lsn: bool,
key: Key,
shard: ShardNumber,
cont_lsn: Lsn,
@@ -509,13 +476,23 @@ pub struct MissingKeyError {
impl std::fmt::Display for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
self.key, self.shard, self.cont_lsn, self.request_lsn
)?;
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
write!(f, ", ancestor {}", ancestor_lsn)?;
if self.stuck_at_lsn {
// Records were found in this timeline, but no image layer or initial delta record was found.
write!(
f,
"could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}",
self.key, self.shard, self.cont_lsn, self.request_lsn
)?;
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
write!(f, ", ancestor {}", ancestor_lsn)?;
}
} else {
// No records in this timeline.
write!(
f,
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
self.key, self.shard, self.cont_lsn, self.request_lsn
)?;
}
if !self.traversal_path.is_empty() {
@@ -591,8 +568,8 @@ pub(crate) enum GetVectoredError {
#[error("Requested at invalid LSN: {0}")]
InvalidLsn(Lsn),
#[error("Requested key not found: {0}")]
MissingKey(MissingKeyError),
#[error("Requested key {0} not found")]
MissingKey(Key),
#[error(transparent)]
GetReadyAncestorError(GetReadyAncestorError),
@@ -701,7 +678,7 @@ impl From<GetVectoredError> for PageReconstructError {
GetVectoredError::Cancelled => PageReconstructError::Cancelled,
GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")),
err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()),
GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err),
err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()),
GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err),
GetVectoredError::Other(err) => PageReconstructError::Other(err),
}
@@ -865,13 +842,9 @@ impl Timeline {
// Initialise the reconstruct state for the key with the cache
// entry returned above.
let mut reconstruct_state = ValuesReconstructState::new();
// Only add the cached image to the reconstruct state when it exists.
if cached_page_img.is_some() {
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
}
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
@@ -897,15 +870,16 @@ impl Timeline {
value
}
}
None => Err(PageReconstructError::MissingKey(MissingKeyError {
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn: Lsn(0),
request_lsn: lsn,
ancestor_lsn: None,
traversal_path: Vec::new(),
backtrace: None,
})),
None => {
error!(
"Expected {}, but singular vectored get returned nothing",
key
);
Err(PageReconstructError::Other(anyhow!(
"Singular vectored get did not return a value for {}",
key
)))
}
}
}
}
@@ -1055,70 +1029,6 @@ impl Timeline {
res
}
/// Scan the keyspace and return all existing key-values in it. This is currently implemented on
/// top of vectored get. A normal vectored get would throw an error when a key in the keyspace is
/// not found during the search, but the scan interface returns all existing key-value pairs and
/// does not expect every single key in the keyspace to be found. The semantics are closer to the
/// RocksDB scan iterator interface. We could optimize this interface later to avoid some checks
/// in the vectored get path by maintaining and splitting the probing and to-be-probed keyspaces.
/// We also need to ensure that the scan operation will not cause OOM in the future.
#[allow(dead_code)]
pub(crate) async fn scan(
&self,
keyspace: KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
if !lsn.is_valid() {
return Err(GetVectoredError::InvalidLsn(lsn));
}
trace!(
"key-value scan request for {:?}@{} from task kind {:?}",
keyspace,
lsn,
ctx.task_kind()
);
// We should generalize this into Keyspace::contains in the future.
for range in &keyspace.ranges {
if range.start.field1 < METADATA_KEY_BEGIN_PREFIX
|| range.end.field1 > METADATA_KEY_END_PREFIX
{
return Err(GetVectoredError::Other(anyhow::anyhow!(
"only metadata keyspace can be scanned"
)));
}
}
let start = crate::metrics::SCAN_LATENCY
.for_task_kind(ctx.task_kind())
.map(ScanLatencyOngoingRecording::start_recording);
// start counting after throttle so that throttle time
// is always less than observation time
let throttled = self
.timeline_get_throttle
// assume scan = 1 quota for now until we find a better way to process this
.throttle(ctx, 1)
.await;
let vectored_res = self
.get_vectored_impl(
keyspace.clone(),
lsn,
ValuesReconstructState::default(),
ctx,
)
.await;
if let Some(recording) = start {
recording.observe(throttled);
}
vectored_res
}
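The doc comment's contrast with vectored get comes down to "missing keys are skipped, not errors"; a stand-in sketch of that semantic over a plain BTreeMap:

// Sketch (stand-in store): scan returns whatever exists in the range; a key's
// absence is not an error, unlike a plain vectored get.
use std::collections::BTreeMap;
use std::ops::Range;

fn scan(store: &BTreeMap<u64, Vec<u8>>, keyspace: Range<u64>) -> BTreeMap<u64, Vec<u8>> {
    store
        .range(keyspace)
        .map(|(k, v)| (*k, v.clone()))
        .collect()
}

fn main() {
    let mut store = BTreeMap::new();
    store.insert(3u64, vec![0xAA]);
    // Keys 0..3 and 4..10 are absent; scan simply omits them.
    assert_eq!(scan(&store, 0..10).len(), 1);
}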
/// Not subject to [`Self::timeline_get_throttle`].
pub(super) async fn get_vectored_sequential_impl(
&self,
@@ -1127,7 +1037,6 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let mut values = BTreeMap::new();
for range in keyspace.ranges {
let mut key = range.start;
while key != range.end {
@@ -1140,17 +1049,16 @@ impl Timeline {
Err(Cancelled | AncestorStopping(_)) => {
return Err(GetVectoredError::Cancelled)
}
Err(MissingKey(_))
if NON_INHERITED_RANGE.contains(&key)
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
{
// Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range.
// When we add more types of keys into the page server, we should revisit this part of code and throw errors
// accordingly.
key = key.next();
}
Err(MissingKey(err)) => {
return Err(GetVectoredError::MissingKey(err));
// we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380
Err(MissingKey(MissingKeyError {
stuck_at_lsn: false,
..
})) if !NON_INHERITED_RANGE.contains(&key) => {
// The vectored read path handles non inherited keys specially.
// If such a key cannot be reconstructed from the current timeline,
// the vectored read path returns a key level error as opposed to a top
// level error.
return Err(GetVectoredError::MissingKey(key));
}
Err(Other(err))
if err
@@ -1237,11 +1145,6 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) {
if keyspace.overlaps(&Key::metadata_key_range()) {
// skip validation for metadata key range
return;
}
let sequential_res = self
.get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
.await;
@@ -1251,7 +1154,7 @@ impl Timeline {
match (lhs, rhs) {
(Oversized(l), Oversized(r)) => l == r,
(InvalidLsn(l), InvalidLsn(r)) => l == r,
(MissingKey(l), MissingKey(r)) => l.key == r.key,
(MissingKey(l), MissingKey(r)) => l == r,
(GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
(Other(_), Other(_)) => true,
_ => false,
@@ -1266,7 +1169,7 @@ impl Timeline {
" - keyspace={:?} lsn={}"),
seq_err, keyspace, lsn) },
(Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
// Sequential get runs after vectored get, so it is possible for the latter
// to time out while waiting for its ancestor's Lsn to become ready and for the
// former to succeed (it essentially has a doubled wait time).
},
@@ -1501,21 +1404,15 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_and_flush0().await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
// Check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
// ephemeral layer bytes has been breached.
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
///
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
/// even if there are no ongoing writes to drive that.
async fn maybe_freeze_ephemeral_layer(&self) {
let Ok(_write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
@@ -1542,11 +1439,13 @@ impl Timeline {
// we are a sharded tenant and have skipped some WAL
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
// Only do this if we have been layer-less longer than get_checkpoint_timeout, so that a shard
// without any data ingested (yet) doesn't write a remote index as soon as it
// This should be somewhat rare, so we log it at INFO level.
//
// We checked for checkpoint timeout so that a shard without any
// data ingested (yet) doesn't write a remote index as soon as it
// sees its LSN advance: we only do this if we've been layer-less
// for some time.
tracing::debug!(
tracing::info!(
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
disk_consistent_lsn,
last_record_lsn
@@ -1636,6 +1535,11 @@ impl Timeline {
(guard, permit)
};
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle.
self.maybe_freeze_ephemeral_layer().await;
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
@@ -1905,7 +1809,7 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerFileName,
layer_file_name: &str,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
@@ -1923,10 +1827,7 @@ impl Timeline {
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
pub(crate) async fn evict_layer(
&self,
layer_file_name: &LayerFileName,
) -> anyhow::Result<Option<bool>> {
pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
.enter()
@@ -2000,12 +1901,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
#[allow(dead_code)]
pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.switch_aux_file_policy
.unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy)
.switch_to_aux_file_v2
.unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2)
}
pub(crate) fn get_lazy_slru_download(&self) -> bool {
@@ -2209,7 +2111,11 @@ impl Timeline {
write_lock: tokio::sync::Mutex::new(None),
gc_info: std::sync::RwLock::new(GcInfo::default()),
gc_info: std::sync::RwLock::new(GcInfo {
retain_lsns: Vec::new(),
horizon_cutoff: Lsn(0),
pitr_cutoff: Lsn(0),
}),
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
@@ -2417,8 +2323,8 @@ impl Timeline {
for discovered in discovered {
let (name, kind) = match discovered {
Discovered::Layer(layer_file_name, local_path, file_size) => {
discovered_layers.push((layer_file_name, local_path, file_size));
Discovered::Layer(file_name, file_size) => {
discovered_layers.push((file_name, file_size));
continue;
}
Discovered::Metadata => {
@@ -2463,7 +2369,7 @@ impl Timeline {
let mut needs_cleanup = Vec::new();
let mut total_physical_size = 0;
for (name, local_path, decision) in decided {
for (name, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
// Remote is authoritative, but we may still choose to retain
@@ -2473,23 +2379,26 @@ impl Timeline {
// the correct generation.
UseLocal(remote)
} else {
let local_path = local_path.as_ref().expect("Locally found layer must have path");
init::cleanup_local_file_for_remote(local_path, &local, &remote)?;
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
}
Ok(decision) => decision,
Err(DismissedLayer::Future { local }) => {
if local.is_some() {
let local_path = local_path.expect("Locally found layer must have path");
init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?;
path.push(name.file_name());
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
path.pop();
}
needs_cleanup.push(name);
continue;
}
Err(DismissedLayer::LocalOnly(local)) => {
let local_path = local_path.expect("Locally found layer must have path");
init::cleanup_local_only_file(&local_path, &name, &local)?;
path.push(name.file_name());
init::cleanup_local_only_file(&path, &name, &local)?;
path.pop();
// this file never existed remotely, we will have to do rework
continue;
}
@@ -2505,18 +2414,7 @@ impl Timeline {
let layer = match decision {
UseLocal(m) => {
total_physical_size += m.file_size();
let local_path = local_path.unwrap_or_else(|| {
local_layer_path(
conf,
&this.tenant_shard_id,
&this.timeline_id,
&name,
&m.generation,
)
});
Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard()
Layer::for_resident(conf, &this, name, m).drop_eviction_guard()
}
Evicted(remote) | UseRemote { remote, .. } => {
Layer::for_evicted(conf, &this, name, remote)
@@ -2997,11 +2895,11 @@ impl Timeline {
}
}
async fn find_layer(&self, layer_name: &LayerFileName) -> Option<Layer> {
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
let guard = self.layers.read().await;
for historic_layer in guard.layer_map().iter_historic_layers() {
let historic_layer_name = historic_layer.filename();
if layer_name == &historic_layer_name {
let historic_layer_name = historic_layer.filename().file_name();
if layer_file_name == historic_layer_name {
return Some(guard.get_from_desc(&historic_layer));
}
}
@@ -3031,7 +2929,7 @@ impl Timeline {
HeatMapLayer::new(
layer.layer_desc().filename(),
(&layer.metadata()).into(),
layer.metadata().into(),
last_activity_ts,
)
});
@@ -3126,6 +3024,7 @@ impl Timeline {
// Didn't make any progress in last iteration. Error out to avoid
// getting stuck in the loop.
return Err(PageReconstructError::MissingKey(MissingKeyError {
stuck_at_lsn: true,
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn: Lsn(cont_lsn.0 - 1),
@@ -3140,6 +3039,7 @@ impl Timeline {
}
ValueReconstructResult::Missing => {
return Err(PageReconstructError::MissingKey(MissingKeyError {
stuck_at_lsn: false,
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn,
@@ -3303,12 +3203,37 @@ impl Timeline {
// Do not descend into the ancestor timeline for aux files.
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
// stalling compaction.
keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
});
// TODO(chi): this will need to be updated for aux files v2 storage
if keyspace.overlaps(&NON_INHERITED_RANGE) {
let removed = keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![NON_INHERITED_RANGE],
});
for range in removed.ranges {
let mut key = range.start;
while key < range.end {
reconstruct_state.on_key_error(
key,
PageReconstructError::MissingKey(MissingKeyError {
stuck_at_lsn: false,
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn,
request_lsn,
ancestor_lsn: None,
traversal_path: Vec::default(),
backtrace: if cfg!(test) {
Some(std::backtrace::Backtrace::force_capture())
} else {
None
},
}),
);
key = key.next();
}
}
}
// Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
// into ancestor timelines). TODO: is there any other metadata which we want to inherit?
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
break;
}
@@ -3323,17 +3248,7 @@ impl Timeline {
}
if keyspace.total_raw_size() != 0 {
return Err(GetVectoredError::MissingKey(MissingKeyError {
key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
shard: self
.shard_identity
.get_shard_number(&keyspace.start().unwrap()),
cont_lsn,
request_lsn,
ancestor_lsn: Some(timeline.ancestor_lsn),
traversal_path: vec![],
backtrace: None,
}));
return Err(GetVectoredError::MissingKey(keyspace.start().unwrap()));
}
Ok(())
@@ -3533,7 +3448,7 @@ impl Timeline {
Ok(ancestor)
}
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
@@ -4271,7 +4186,7 @@ impl Timeline {
};
// Write all the keys we just read into our new image layer.
image_layer_writer.put_image(img_key, img, ctx).await?;
image_layer_writer.put_image(img_key, img).await?;
wrote_keys = true;
}
}
@@ -4282,7 +4197,7 @@ impl Timeline {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
start = img_range.end;
let image_layer = image_layer_writer.finish(self, ctx).await?;
let image_layer = image_layer_writer.finish(self).await?;
image_layers.push(image_layer);
} else {
// Special case: the image layer may be empty if this is a sharded tenant and the
@@ -4349,49 +4264,6 @@ impl Timeline {
_ = self.cancel.cancelled() => {}
)
}
/// Detach this timeline from its ancestor by copying all of the ancestor's layers as this
/// Timeline's layers, up to the ancestor_lsn.
///
/// Requires a timeline that:
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
/// - has prev_lsn in remote storage (temporary restriction)
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
///
/// During the operation all timelines sharing the data with this timeline will be reparented
/// from our ancestor to be branches of this timeline.
pub(crate) async fn prepare_to_detach_from_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
/// Completes the ancestor detach. This method is to be called while holding the
/// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
/// timeline be deleted. After this method returns successfully, tenant must be reloaded.
///
/// Pageserver receiving a SIGKILL during this operation is not supported (yet).
pub(crate) async fn complete_detaching_timeline_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
prepared: detach_ancestor::PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
detach_ancestor::complete(self, tenant, prepared, ctx).await
}
}
/// Top-level failure to compact.
@@ -4500,24 +4372,6 @@ impl Timeline {
Ok(())
}
async fn rewrite_layers(
self: &Arc<Self>,
replace_layers: Vec<(Layer, ResidentLayer)>,
drop_layers: Vec<Layer>,
) -> anyhow::Result<()> {
let mut guard = self.layers.write().await;
guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
}
Ok(())
}
/// Schedules the uploads of the given image layers
fn upload_new_image_layers(
self: &Arc<Self>,
@@ -4536,7 +4390,7 @@ impl Timeline {
Ok(())
}
/// Find the Lsns above which layer files need to be retained on
/// Update information about which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
/// page versions more aggressively.
@@ -4544,6 +4398,17 @@ impl Timeline {
/// TODO: that's wishful thinking, compaction doesn't actually do that
/// currently.
///
/// The caller specifies how much history is needed with the 3 arguments:
///
/// retain_lsns: keep a version of each page at these LSNs
/// cutoff_horizon: also keep everything newer than this LSN
/// pitr: the time duration required to keep data for PITR
///
/// The 'retain_lsns' list is currently used to prevent removing files that
/// are needed by child timelines. In the future, the user might be able to
/// name additional points in time to retain. The caller is responsible for
/// collecting that information.
///
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
/// needed by read-only nodes. (As of this writing, the caller just passes
/// the latest LSN minus a constant, and doesn't do anything smart
@@ -4551,22 +4416,29 @@ impl Timeline {
///
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
/// whether a record is needed for PITR.
///
/// NOTE: This function holds a short-lived lock to protect the 'gc_info'
/// field, so that the three values passed as argument are stored
/// atomically. But the caller is responsible for ensuring that no new
/// branches are created that would need to be included in 'retain_lsns',
/// for example. The caller should hold `Tenant::gc_cs` lock to ensure
/// that.
///
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(super) async fn find_gc_cutoffs(
pub(super) async fn update_gc_info(
&self,
retain_lsns: Vec<Lsn>,
cutoff_horizon: Lsn,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcCutoffs> {
) -> anyhow::Result<()> {
let _timer = self
.metrics
.find_gc_cutoffs_histo
.update_gc_info_histo
.start_timer()
.record_on_drop();
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
//
// Some unit tests depend on garbage-collection working even when
@@ -4616,10 +4488,14 @@ impl Timeline {
self.get_last_record_lsn()
};
Ok(GcCutoffs {
horizon: cutoff_horizon,
pitr: pitr_cutoff,
})
// Grab the lock and update the values
*self.gc_info.write().unwrap() = GcInfo {
retain_lsns,
horizon_cutoff: cutoff_horizon,
pitr_cutoff,
};
Ok(())
}
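Per the doc comment's note that the caller passes the latest LSN minus a constant, a minimal stand-in sketch of deriving cutoff_horizon; the saturating behavior for young timelines is an assumption of the sketch:

// Sketch (stand-in Lsn): derive the space-based horizon cutoff from the
// last-record LSN and the gc_horizon setting, as the doc comment describes.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Lsn(u64);

fn cutoff_horizon(last_record_lsn: Lsn, gc_horizon_bytes: u64) -> Lsn {
    // Saturate so a young timeline (LSN < gc_horizon) keeps everything.
    Lsn(last_record_lsn.0.saturating_sub(gc_horizon_bytes))
}

fn main() {
    assert_eq!(cutoff_horizon(Lsn(1_000_000), 64_000), Lsn(936_000));
}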
/// Garbage collect layer files on a timeline that are no longer needed.
@@ -4648,8 +4524,8 @@ impl Timeline {
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
let gc_info = self.gc_info.read().unwrap();
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
let pitr_cutoff = gc_info.cutoffs.pitr;
let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn());
let pitr_cutoff = gc_info.pitr_cutoff;
let retain_lsns = gc_info.retain_lsns.clone();
(horizon_cutoff, pitr_cutoff, retain_lsns)
};
@@ -4676,8 +4552,6 @@ impl Timeline {
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> anyhow::Result<GcResult> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
let now = SystemTime::now();
let mut result: GcResult = GcResult::default();

View File

@@ -15,8 +15,7 @@ use anyhow::{anyhow, Context};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -94,7 +93,7 @@ impl Timeline {
// Define partitioning schema if needed
// FIXME: the match should only cover repartitioning, not the next steps
let partition_count = match self
match self
.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
@@ -147,7 +146,6 @@ impl Timeline {
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
dense_partitioning.parts.len()
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -159,150 +157,9 @@ impl Timeline {
if !self.cancel.is_cancelled() {
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
}
1
}
};
if self.shard_identity.count >= ShardCount::new(2) {
// Limit the number of layer rewrites to the number of partitions: this means its
// runtime should be comparable to a full round of image layer creations, rather than
// being potentially much longer.
let rewrite_max = partition_count;
self.compact_shard_ancestors(rewrite_max, ctx).await?;
}
Ok(())
}
/// Check for layers that are eligible to be rewritten:
/// - Shard splitting: after a shard split, rewrite ancestor layers beyond pitr_interval, so that
/// we don't indefinitely retain keys in this shard that aren't needed.
/// - For future use: layers beyond pitr_interval that are in formats we would
/// rather not maintain compatibility with indefinitely.
///
/// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
/// how much work it will try to do in each compaction pass.
async fn compact_shard_ancestors(
self: &Arc<Self>,
rewrite_max: usize,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut drop_layers = Vec::new();
let layers_to_rewrite: Vec<Layer> = Vec::new();
// We will use the PITR cutoff as a condition for rewriting layers.
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
let layers = self.layers.read().await;
for layer_desc in layers.layer_map().iter_historic_layers() {
let layer = layers.get_from_desc(&layer_desc);
if layer.metadata().shard.shard_count == self.shard_identity.count {
// This layer does not belong to a historic ancestor, no need to re-image it.
continue;
}
// This layer was created on an ancestor shard: check if it contains any data for this shard.
let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
let layer_local_page_count = sharded_range.page_count();
let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
if layer_local_page_count == 0 {
// This ancestral layer only covers keys that belong to other shards.
// We include the full metadata in the log: if we had some critical bug that caused
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
info!(%layer, old_metadata=?layer.metadata(),
"dropping layer after shard split, contains no keys for this shard.",
);
if cfg!(debug_assertions) {
// Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
// wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
// should be !is_key_disposable()
let range = layer_desc.get_key_range();
let mut key = range.start;
while key < range.end {
debug_assert!(self.shard_identity.is_key_disposable(&key));
key = key.next();
}
}
drop_layers.push(layer);
continue;
} else if layer_local_page_count != u32::MAX
&& layer_local_page_count == layer_raw_page_count
{
debug!(%layer,
"layer is entirely shard local ({} keys), no need to filter it",
layer_local_page_count
);
continue;
}
// Don't bother re-writing a layer unless it will at least halve its size
if layer_local_page_count != u32::MAX
&& layer_local_page_count > layer_raw_page_count / 2
{
debug!(%layer,
"layer is already mostly local ({}/{}), not rewriting",
layer_local_page_count,
layer_raw_page_count
);
continue;
}
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
// without incurring the I/O cost of a rewrite.
if layer_desc.get_lsn_range().end >= pitr_cutoff {
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
layer_desc.get_lsn_range().end, pitr_cutoff);
continue;
}
if layer_desc.is_delta() {
// We do not yet implement rewrite of delta layers
debug!(%layer, "Skipping rewrite of delta layer");
continue;
}
// Only rewrite layers if they would have different remote paths: either they belong to this
// shard but an old generation, or they belonged to another shard. This also implicitly
// guarantees that the layer is persistent in remote storage (as only remote persistent
// layers are carried across shard splits, any local-only layer would be in the current generation)
if layer.metadata().generation == self.generation
&& layer.metadata().shard.shard_count == self.shard_identity.count
{
debug!(%layer, "Skipping rewrite, is not from old generation");
continue;
}
if layers_to_rewrite.len() >= rewrite_max {
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
layers_to_rewrite.len()
);
continue;
}
// Fall through: all our conditions for doing a rewrite passed.
// TODO: implement rewriting
tracing::debug!(%layer, "Would rewrite layer");
}
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
drop(layers);
// TODO: collect layers to rewrite
let replace_layers = Vec::new();
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_layers, drop_layers).await?;
if let Some(remote_client) = self.remote_client.as_ref() {
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
remote_client.wait_completion().await?;
}
Ok(())
}
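
Editor's note: the per-layer filtering in `compact_shard_ancestors` reduces to a small decision function. A hypothetical condensation follows, with the counters flattened to plain integers (`u32::MAX` marks an "unknown" local count, as in the diff); the generation/shard-path check and the `rewrite_max` budget are omitted for brevity.

```rust
#[derive(Debug, PartialEq)]
enum Disposition {
    Drop,    // covers no keys for this shard
    Keep,    // leave as-is; it is local enough, still in PITR, or a delta
    Rewrite, // candidate for re-imaging (still a TODO in the code above)
}

fn disposition(local: u32, raw: u32, end_lsn: u64, pitr_cutoff: u64, is_delta: bool) -> Disposition {
    if local == 0 {
        return Disposition::Drop; // ancestral layer with no keys for this shard
    }
    if local != u32::MAX && local == raw {
        return Disposition::Keep; // entirely shard-local already
    }
    if local != u32::MAX && local > raw / 2 {
        return Disposition::Keep; // a rewrite would not halve the size
    }
    if end_lsn >= pitr_cutoff {
        return Disposition::Keep; // still in the PITR window; ages out on its own
    }
    if is_delta {
        return Disposition::Keep; // delta rewrite not implemented yet
    }
    Disposition::Rewrite
}
```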
@@ -663,7 +520,7 @@ impl Timeline {
writer
.take()
.unwrap()
.finish(prev_key.unwrap().next(), self, ctx)
.finish(prev_key.unwrap().next(), self)
.await?,
);
writer = None;
@@ -705,11 +562,7 @@ impl Timeline {
);
}
writer
.as_mut()
.unwrap()
.put_value(key, lsn, value, ctx)
.await?;
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
@@ -725,7 +578,7 @@ impl Timeline {
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?);
new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
}
// Sync layers
@@ -1119,7 +972,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
let value = val.load(ctx).await?;
writer.put_value(key, lsn, value, ctx).await?;
writer.put_value(key, lsn, value).await?;
prev = Some((key, lsn));
}
@@ -1135,7 +988,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
});
let new_delta_layer = writer
.finish(prev.unwrap().0.next(), &self.timeline, ctx)
.finish(prev.unwrap().0.next(), &self.timeline)
.await?;
self.new_deltas.push(new_delta_layer);
@@ -1205,11 +1058,11 @@ impl TimelineAdaptor {
}
}
};
image_layer_writer.put_image(key, img, ctx).await?;
image_layer_writer.put_image(key, img).await?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
let image_layer = image_layer_writer.finish(&self.timeline).await?;
self.new_images.push(image_layer);

View File

@@ -422,10 +422,6 @@ impl DeleteTimelineFlow {
pub(crate) fn is_finished(&self) -> bool {
matches!(self, Self::Finished)
}
pub(crate) fn is_not_started(&self) -> bool {
matches!(self, Self::NotStarted)
}
}
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

View File

@@ -1,550 +0,0 @@
use std::sync::Arc;
use super::{layer_manager::LayerManager, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
tenant::{
storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
Tenant,
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
#[derive(Debug, thiserror::Error)]
pub(crate) enum Error {
#[error("no ancestors")]
NoAncestor,
#[error("too many ancestors")]
TooManyAncestors,
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("detached timeline must receive writes before the operation")]
DetachedTimelineNeedsWrites,
#[error("flushing failed")]
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]
CopyDeltaPrefix(#[source] anyhow::Error),
#[error("upload rewritten layer")]
UploadRewritten(#[source] anyhow::Error),
#[error("ancestor is already being detached by: {}", .0)]
OtherTimelineDetachOngoing(TimelineId),
#[error("remote copying layer failed")]
CopyFailed(#[source] anyhow::Error),
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
#[derive(Debug)]
pub(crate) struct Options {
pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
pub(crate) copy_concurrency: std::num::NonZeroUsize,
}
impl Default for Options {
fn default() -> Self {
Self {
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
}
}
}
/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
tenant: &Tenant,
options: Options,
ctx: &RequestContext,
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;
if detached.remote_client.as_ref().is_none() {
unimplemented!("no new code for running without remote storage");
}
let Some((ancestor, ancestor_lsn)) = detached
.ancestor_timeline
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
return Err(NoAncestor);
};
if !ancestor_lsn.is_valid() {
return Err(NoAncestor);
}
if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to
return Err(TooManyAncestors);
}
if detached.get_prev_record_lsn() == Lsn::INVALID
|| detached.disk_consistent_lsn.load() == ancestor_lsn
{
// this is to avoid a problem that after detaching we would be unable to start up the
// compute because of "PREV_LSN: invalid".
return Err(DetachedTimelineNeedsWrites);
}
// before we acquire the gate, we must mark the ancestor as having a detach operation
ongoing which will block other concurrent detach operations so we don't get to awkward
// situations where there would be two branches trying to reparent earlier branches.
let (guard, barrier) = completion::channel();
{
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(OtherTimelineDetachOngoing(*tl));
}
}
*guard = Some((detached.timeline_id, barrier));
}
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
let span =
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
async {
let started_at = std::time::Instant::now();
let freeze_and_flush = ancestor.freeze_and_flush0();
let mut freeze_and_flush = std::pin::pin!(freeze_and_flush);
let res =
tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush)
.await;
let res = match res {
Ok(res) => res,
Err(_elapsed) => {
tracing::info!("freezing and flushing ancestor is still ongoing");
freeze_and_flush.await
}
};
res.map_err(FlushAncestor)?;
// we do not need to wait for uploads to complete but we do need `struct Layer`,
// copying delta prefix is unsupported currently for `InMemoryLayer`.
tracing::info!(
elapsed_ms = started_at.elapsed().as_millis(),
"froze and flushed the ancestor"
);
Ok(())
}
.instrument(span)
.await?;
}
let end_lsn = ancestor_lsn + 1;
let (filtered_layers, straddling_branchpoint, rest_of_historic) = {
// we do not need to start from our layers, because they can only be layers that come
// *after* ancestor_lsn
let layers = tokio::select! {
guard = ancestor.layers.read() => guard,
_ = detached.cancel.cancelled() => {
return Err(ShuttingDown);
}
_ = ancestor.cancel.cancelled() => {
return Err(ShuttingDown);
}
};
// between retries, these can change if compaction or gc ran in between. this will mean
// we have to redo work.
partition_work(ancestor_lsn, &layers)
};
// TODO: layers are already sorted by something: use that to determine how much of remote
// copies are already done.
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
let mut new_layers: Vec<Layer> =
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
{
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
let mut tasks = tokio::task::JoinSet::new();
let mut wrote_any = false;
let limiter = Arc::new(tokio::sync::Semaphore::new(
options.rewrite_concurrency.get(),
));
for layer in straddling_branchpoint {
let limiter = limiter.clone();
let timeline = detached.clone();
let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
tasks.spawn(async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
Ok(copied)
});
}
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(Some(copied))) => {
wrote_any = true;
tracing::info!(layer=%copied, "rewrote and uploaded");
new_layers.push(copied);
}
Ok(Ok(None)) => {}
Ok(Err(e)) => return Err(e),
Err(je) => return Err(Unexpected(je.into())),
}
}
// FIXME: the fsync should be mandatory, after both rewrites and copies
if wrote_any {
let timeline_dir = VirtualFile::open(
&detached
.conf
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
}
let mut tasks = tokio::task::JoinSet::new();
let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));
for adopted in rest_of_historic {
let limiter = limiter.clone();
let timeline = detached.clone();
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let owned =
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
tracing::info!(layer=%owned, "remote copied");
Ok(owned)
}
.in_current_span(),
);
}
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(owned)) => {
new_layers.push(owned);
}
Ok(Err(failed)) => {
return Err(failed);
}
Err(je) => return Err(Unexpected(je.into())),
}
}
// TODO: fsync directory again if we hardlinked something
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok((guard, prepared))
}
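
Editor's note: both phases of `prepare()` use the same concurrency shape: spawn one task per layer into a `JoinSet`, throttled by a semaphore sized from `Options`. A standalone sketch of that pattern (the job payloads are placeholders for the real copy/rewrite work):

```rust
use std::sync::Arc;
use tokio::{sync::Semaphore, task::JoinSet};

// Run `jobs` with at most `limit` in flight, collecting results and
// surfacing the first failure, as the copy/rewrite loops above do.
async fn run_limited(jobs: Vec<u64>, limit: usize) -> anyhow::Result<Vec<u64>> {
    let limiter = Arc::new(Semaphore::new(limit));
    let mut tasks = JoinSet::new();
    for job in jobs {
        let limiter = limiter.clone();
        tasks.spawn(async move {
            let _permit = limiter.acquire().await?;
            anyhow::Ok(job * 2) // stand-in for upload_rewritten_layer / remote_copy
        });
    }
    let mut out = Vec::new();
    while let Some(res) = tasks.join_next().await {
        out.push(res??); // outer `?`: join error (panic), inner `?`: task error
    }
    Ok(out)
}
```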
fn partition_work(
ancestor_lsn: Lsn,
source_layermap: &LayerManager,
) -> (usize, Vec<Layer>, Vec<Layer>) {
let mut straddling_branchpoint = vec![];
let mut rest_of_historic = vec![];
let mut later_by_lsn = 0;
for desc in source_layermap.layer_map().iter_historic_layers() {
// off-by-one chances here:
// - start is inclusive
// - end is exclusive
if desc.lsn_range.start > ancestor_lsn {
later_by_lsn += 1;
continue;
}
let target = if desc.lsn_range.start <= ancestor_lsn
&& desc.lsn_range.end > ancestor_lsn
&& desc.is_delta
{
// TODO: image layer at Lsn optimization
&mut straddling_branchpoint
} else {
&mut rest_of_historic
};
target.push(source_layermap.get_from_desc(&desc));
}
(later_by_lsn, straddling_branchpoint, rest_of_historic)
}
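
Editor's note: to make the inclusive-start/exclusive-end convention concrete, a tiny self-contained example with plain `u64` LSNs and hypothetical values:

```rust
// A delta straddles the branch point iff it starts at or before the
// ancestor LSN (inclusive start) and ends after it (exclusive end).
fn straddles(start: u64, end: u64, ancestor_lsn: u64, is_delta: bool) -> bool {
    start <= ancestor_lsn && end > ancestor_lsn && is_delta
}

fn main() {
    let ancestor = 100;
    assert!(straddles(80, 120, ancestor, true));   // rewrite its LSN prefix
    assert!(!straddles(40, 90, ancestor, true));   // entirely below: adopted as-is
    assert!(!straddles(101, 200, ancestor, true)); // strictly later: counted in later_by_lsn, skipped
}
```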
async fn upload_rewritten_layer(
end_lsn: Lsn,
layer: &Layer,
target: &Arc<Timeline>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Option<Layer>, Error> {
use Error::UploadRewritten;
let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?;
let Some(copied) = copied else {
return Ok(None);
};
// FIXME: better shuttingdown error
target
.remote_client
.as_ref()
.unwrap()
.upload_layer_file(&copied, cancel)
.await
.map_err(UploadRewritten)?;
Ok(Some(copied.into()))
}
async fn copy_lsn_prefix(
end_lsn: Lsn,
layer: &Layer,
target_timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
let mut writer = DeltaLayerWriter::new(
target_timeline.conf,
target_timeline.timeline_id,
target_timeline.tenant_shard_id,
layer.layer_desc().key_range.start,
layer.layer_desc().lsn_range.start..end_lsn,
)
.await
.map_err(CopyDeltaPrefix)?;
let resident = layer
.download_and_keep_resident()
.await
// likely shutdown
.map_err(RewrittenDeltaDownloadFailed)?;
let records = resident
.copy_delta_prefix(&mut writer, end_lsn, ctx)
.await
.map_err(CopyDeltaPrefix)?;
drop(resident);
tracing::debug!(%layer, records, "copied records");
if records == 0 {
drop(writer);
// TODO: we might want to store an empty marker in remote storage for this
// layer so that we will not needlessly walk `layer` on repeated attempts.
Ok(None)
} else {
// reuse the key instead of adding more holes between layers by using the real
// highest key in the layer.
let reused_highest_key = layer.layer_desc().key_range.end;
let copied = writer
.finish(reused_highest_key, target_timeline, ctx)
.await
.map_err(CopyDeltaPrefix)?;
tracing::debug!(%layer, %copied, "new layer produced");
Ok(Some(copied))
}
}
/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote
/// storage on successful return without the adopted layer being added to `index_part.json`.
async fn remote_copy(
adopted: &Layer,
adoptee: &Arc<Timeline>,
generation: Generation,
cancel: &CancellationToken,
) -> Result<Layer, Error> {
use Error::CopyFailed;
// depending if Layer::keep_resident we could hardlink
let mut metadata = adopted.metadata();
debug_assert!(metadata.generation <= generation);
metadata.generation = generation;
let owned = crate::tenant::storage_layer::Layer::for_evicted(
adoptee.conf,
adoptee,
adopted.layer_desc().filename(),
metadata,
);
// FIXME: better shuttingdown error
adoptee
.remote_client
.as_ref()
.unwrap()
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
.map_err(CopyFailed)
}
/// See [`Timeline::complete_detaching_timeline_ancestor`].
pub(super) async fn complete(
detached: &Arc<Timeline>,
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
let rtc = detached
.remote_client
.as_ref()
.expect("has to have a remote timeline client for timeline ancestor detach");
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
.get_ancestor_timeline()
.expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn();
// publish the prepared layers before we reparent any of the timelines, so that on restart
// reparented timelines find layers. also do the actual detaching.
//
// if we crash after this operation, we will at least come up having detached a timeline, but
we cannot go back and reparent the timelines which would have been reparented in normal
// execution.
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
let mut tasks = tokio::task::JoinSet::new();
// because we are now keeping the slot in progress, it is unlikely that there will be any
// timeline deletions during this time. if we raced one, then we'll just ignore it.
tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
if Arc::ptr_eq(tl, detached) {
return None;
}
if !tl.is_active() {
return None;
}
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl
.delete_progress
.try_lock()
.map(|flow| !flow.is_not_started())
.unwrap_or(true);
if is_same && is_earlier && !is_deleting {
Some(tl.clone())
} else {
None
}
})
.for_each(|timeline| {
// important in this scope: we are holding the Tenant::timelines lock
let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
let new_parent = detached.timeline_id;
tasks.spawn(
async move {
let res = timeline
.remote_client
.as_ref()
.expect("reparented has to have remote client because detached has one")
.schedule_reparenting_and_wait(&new_parent)
.await;
match res {
Ok(()) => Some(timeline),
Err(e) => {
// with the use of tenant slot, we no longer expect these.
tracing::warn!("reparenting failed: {e:#}");
None
}
}
}
.instrument(span),
);
});
let reparenting_candidates = tasks.len();
let mut reparented = Vec::with_capacity(tasks.len());
while let Some(res) = tasks.join_next().await {
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push(timeline.timeline_id);
}
Ok(None) => {
// let's just ignore this for now. one or all reparented timelines could have
// started deletion, and that is fine.
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// ignore; it's better to continue with a single reparenting failing (or even
// all of them) in order to get to the goal state.
//
// these timelines will never be reparentable, but they can always be detached as
// separate tree roots.
}
Err(je) => tracing::error!("unexpected join error: {je:?}"),
}
}
if reparenting_candidates != reparented.len() {
tracing::info!("failed to reparent some candidates");
}
Ok(reparented)
}
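
Editor's note: the `filter_map` in `complete()` encodes the reparenting eligibility rules. Reduced to plain booleans (a sketch; the `Arc` identity checks and the `try_lock` probe are abstracted into parameters):

```rust
// A timeline is reparented onto the detached timeline iff all of these hold.
fn is_reparent_candidate(
    is_the_detached_timeline: bool, // Arc::ptr_eq(tl, detached)
    is_active: bool,
    shares_ancestor: bool,          // Arc::ptr_eq(&ancestor, tl_ancestor)
    branched_at_or_before: bool,    // tl.get_ancestor_lsn() <= ancestor_lsn
    deletion_started: bool,         // delete_progress probe; a held lock counts as deleting
) -> bool {
    !is_the_detached_timeline
        && is_active
        && shares_ancestor
        && branched_at_or_before
        && !deletion_started
}
```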

View File

@@ -12,7 +12,7 @@ use crate::{
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8Path;
use pageserver_api::shard::ShardIndex;
use std::{collections::HashMap, str::FromStr};
use utils::lsn::Lsn;
@@ -20,7 +20,7 @@ use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerFileName, Utf8PathBuf, u64),
Layer(LayerFileName, u64),
/// Old ephemeral files from previous launches, should be removed
Ephemeral(String),
/// Old temporary timeline files, unsure what these really are, should be removed
@@ -46,7 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
let discovered = match LayerFileName::from_str(&file_name) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
Discovered::Layer(file_name, file_size)
}
Err(_) => {
if file_name == METADATA_FILE_NAME {
@@ -104,38 +104,26 @@ pub(super) enum DismissedLayer {
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>,
discovered: Vec<(LayerFileName, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(
LayerFileName,
Option<Utf8PathBuf>,
Result<Decision, DismissedLayer>,
)> {
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
use Decision::*;
// name => (local_path, local_metadata, remote_metadata)
type Collected = HashMap<
LayerFileName,
(
Option<Utf8PathBuf>,
Option<LayerFileMetadata>,
Option<LayerFileMetadata>,
),
>;
// name => (local, remote)
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
let mut discovered = discovered
.into_iter()
.map(|(layer_name, local_path, file_size)| {
.map(|(name, file_size)| {
(
layer_name,
name,
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(local_path),
Some(LayerFileMetadata::new(file_size, generation, shard)),
None,
),
@@ -152,15 +140,15 @@ pub(super) fn reconcile(
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let Some(existing) = discovered.get_mut(name) {
existing.2 = Some(metadata);
existing.1 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, None, Some(metadata)));
discovered.insert(name.to_owned(), (None, Some(metadata)));
}
});
discovered
.into_iter()
.map(|(name, (local_path, local, remote))| {
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(DismissedLayer::Future { local })
} else {
@@ -177,7 +165,7 @@ pub(super) fn reconcile(
}
};
(name, local_path, decision)
(name, decision)
})
.collect::<Vec<_>>()
}
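
Editor's note: the merge in `reconcile` is a two-pass fill of a name → (local, remote) map. A minimal model with the metadata reduced to a file size and hypothetical types:

```rust
use std::collections::HashMap;

// First pass seeds local discoveries; second pass attaches remote index
// entries, inserting remote-only layers with no local half.
fn merge(
    local: Vec<(String, u64)>,
    remote: Vec<(String, u64)>,
) -> HashMap<String, (Option<u64>, Option<u64>)> {
    let mut m: HashMap<String, (Option<u64>, Option<u64>)> = local
        .into_iter()
        .map(|(name, size)| (name, (Some(size), None)))
        .collect();
    for (name, size) in remote {
        m.entry(name).or_insert((None, None)).1 = Some(size);
    }
    m
}
```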

View File

@@ -205,24 +205,6 @@ impl LayerManager {
updates.flush();
}
/// Called when compaction is completed.
pub(crate) fn rewrite_layers(
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
drop_layers: &[Layer],
_metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
// TODO: implement rewrites (currently this code path only used for drops)
assert!(rewrite_layers.is_empty());
for l in drop_layers {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}
/// Called when garbage collect has selected the layers to be removed.
pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
let mut updates = self.layer_map.batch_update();

View File

@@ -1535,7 +1535,7 @@ mod tests {
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
let mut state = dummy_state(&harness).await;
state.conf.availability_zone.clone_from(&test_az);
state.conf.availability_zone = test_az.clone();
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1568,7 +1568,7 @@ mod tests {
// We have another safekeeper with the same commit_lsn, and it has the same availability zone as
// the current pageserver.
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
same_az_sk.timeline.availability_zone.clone_from(&test_az);
same_az_sk.timeline.availability_zone = test_az.clone();
state.wal_stream_candidates = HashMap::from([
(

View File

@@ -10,7 +10,6 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use crate::context::RequestContext;
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
use crate::page_cache::PageWriteGuard;
@@ -616,7 +615,6 @@ impl VirtualFile {
&self,
buf: B,
mut offset: u64,
ctx: &RequestContext,
) -> (B::Buf, Result<(), Error>) {
let buf_len = buf.bytes_init();
if buf_len == 0 {
@@ -625,7 +623,7 @@ impl VirtualFile {
let mut buf = buf.slice(0..buf_len);
while !buf.is_empty() {
let res;
(buf, res) = self.write_at(buf, offset, ctx).await;
(buf, res) = self.write_at(buf, offset).await;
match res {
Ok(0) => {
return (
@@ -654,7 +652,6 @@ impl VirtualFile {
pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<usize, Error>) {
let nbytes = buf.bytes_init();
if nbytes == 0 {
@@ -663,7 +660,7 @@ impl VirtualFile {
let mut buf = buf.slice(0..nbytes);
while !buf.is_empty() {
let res;
(buf, res) = self.write(buf, ctx).await;
(buf, res) = self.write(buf).await;
match res {
Ok(0) => {
return (
@@ -687,10 +684,9 @@ impl VirtualFile {
async fn write<B: IoBuf + Send>(
&mut self,
buf: Slice<B>,
ctx: &RequestContext,
) -> (Slice<B>, Result<usize, std::io::Error>) {
let pos = self.pos;
let (buf, res) = self.write_at(buf, pos, ctx).await;
let (buf, res) = self.write_at(buf, pos).await;
let n = match res {
Ok(n) => n,
Err(e) => return (buf, Err(e)),
@@ -728,7 +724,6 @@ impl VirtualFile {
&self,
buf: Slice<B>,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
) -> (Slice<B>, Result<usize, Error>) {
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,
@@ -1093,9 +1088,8 @@ impl OwnedAsyncWriter for VirtualFile {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
let (buf, res) = VirtualFile::write_all(self, buf).await;
res.map(move |v| (v, buf))
}
}
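
Editor's note: the write paths above all share one shape: loop until the buffer is drained, treating a zero-byte write as an error. The same loop over `std::io`, as a reference sketch (the owned-buffer slicing is replaced by plain byte slices):

```rust
use std::io::{Error, ErrorKind, Write};

// Keep issuing writes, advancing past what was accepted, until the whole
// buffer is written or an error (including a zero-length write) stops us.
fn write_all_loop<W: Write>(dst: &mut W, mut buf: &[u8]) -> Result<(), Error> {
    while !buf.is_empty() {
        match dst.write(buf) {
            Ok(0) => {
                return Err(Error::new(
                    ErrorKind::WriteZero,
                    "failed to write whole buffer",
                ))
            }
            Ok(n) => buf = &buf[n..],
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
```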
@@ -1152,9 +1146,6 @@ fn get_open_files() -> &'static OpenFiles {
#[cfg(test)]
mod tests {
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use super::*;
use rand::seq::SliceRandom;
use rand::thread_rng;
@@ -1186,11 +1177,10 @@ mod tests {
&self,
buf: B,
offset: u64,
ctx: &RequestContext,
) -> Result<(), Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => {
let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
let (_buf, res) = file.write_all_at(buf, offset).await;
res
}
MaybeVirtualFile::File(file) => {
@@ -1211,11 +1201,10 @@ mod tests {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> Result<(), Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => {
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf).await;
res.map(|_| ())
}
MaybeVirtualFile::File(file) => {
@@ -1286,7 +1275,6 @@ mod tests {
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
{
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
std::fs::create_dir_all(&testdir)?;
@@ -1300,7 +1288,7 @@ mod tests {
.to_owned(),
)
.await?;
file_a.write_all(b"foobar".to_vec(), &ctx).await?;
file_a.write_all(b"foobar".to_vec()).await?;
// cannot read from a file opened in write-only mode
let _ = file_a.read_string().await.unwrap_err();
@@ -1309,7 +1297,7 @@ mod tests {
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
// cannot write to a file opened in read-only mode
let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();
// Try simple read
assert_eq!("foobar", file_a.read_string().await?);
@@ -1351,8 +1339,8 @@ mod tests {
.to_owned(),
)
.await?;
file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
file_b.write_all_at(b"BAR".to_vec(), 3).await?;
file_b.write_all_at(b"FOO".to_vec(), 0).await?;
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");

View File

@@ -1,4 +1,4 @@
use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter};
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
use tokio_epoll_uring::{BoundedBuf, IoBuf};
pub struct Writer<W> {
@@ -38,9 +38,8 @@ where
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
let (nwritten, buf) = self.dst.write_all(buf).await?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok((nwritten, buf))
}

View File

@@ -1,15 +1,12 @@
use bytes::BytesMut;
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use crate::context::RequestContext;
/// A trait for doing owned-buffer write IO.
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
pub trait OwnedAsyncWriter {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)>;
}
@@ -60,9 +57,8 @@ where
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
self.flush(ctx).await?;
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
self.flush().await?;
let Self { buf, writer } = self;
assert!(buf.is_some());
Ok(writer)
@@ -76,15 +72,14 @@ where
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn write_buffered<S: IoBuf + Send>(
&mut self,
chunk: Slice<S>,
ctx: &RequestContext,
) -> std::io::Result<(usize, S)> {
pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
where
S: IoBuf + Send,
{
let chunk_len = chunk.len();
// avoid memcpy for the middle of the chunk
if chunk.len() >= self.buf().cap() {
self.flush(ctx).await?;
self.flush().await?;
// do a big write, bypassing `buf`
assert_eq!(
self.buf
@@ -93,7 +88,7 @@ where
.pending(),
0
);
let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?;
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
assert_eq!(nwritten, chunk_len);
return Ok((nwritten, chunk));
}
@@ -109,7 +104,7 @@ where
slice = &slice[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush(ctx).await?;
self.flush().await?;
}
}
assert!(slice.is_empty(), "by now we should have drained the chunk");
@@ -121,11 +116,7 @@ where
/// It is less performant because we always have to copy the borrowed data into the internal buffer
/// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
/// for large writes.
pub async fn write_buffered_borrowed(
&mut self,
mut chunk: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
let chunk_len = chunk.len();
while !chunk.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
@@ -136,20 +127,20 @@ where
chunk = &chunk[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush(ctx).await?;
self.flush().await?;
}
}
Ok(chunk_len)
}
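
Editor's note: after this change the buffered writer is driven without a `RequestContext`. A hypothetical usage sketch, assuming the `BufferedWriter`/`BytesMut` pairing the tests below use and the `Vec<u8>` `OwnedAsyncWriter` impl further down: small writes coalesce in the 2-byte buffer, and `flush_and_into_inner()` drains the tail.

```rust
async fn demo() -> std::io::Result<()> {
    let sink: Vec<u8> = Vec::new(); // OwnedAsyncWriter impl below
    let mut writer = BufferedWriter::new(sink, bytes::BytesMut::with_capacity(2));
    writer.write_buffered_borrowed(b"abc").await?;
    writer.write_buffered_borrowed(b"de").await?;
    let sink = writer.flush_and_into_inner().await?;
    assert_eq!(sink, b"abcde".to_vec());
    Ok(())
}
```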
async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
async fn flush(&mut self) -> std::io::Result<()> {
let buf = self.buf.take().expect("must not use after an error");
let buf_len = buf.pending();
if buf_len == 0 {
self.buf = Some(buf);
return Ok(());
}
let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?;
let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
assert_eq!(nwritten, buf_len);
self.buf = Some(Buffer::reuse_after_flush(io_buf));
Ok(())
@@ -215,7 +206,6 @@ impl OwnedAsyncWriter for Vec<u8> {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
_: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let nbytes = buf.bytes_init();
if nbytes == 0 {
@@ -232,8 +222,6 @@ mod tests {
use bytes::BytesMut;
use super::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;
#[derive(Default)]
struct RecorderWriter {
@@ -243,7 +231,6 @@ mod tests {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
_: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let nbytes = buf.bytes_init();
if nbytes == 0 {
@@ -256,14 +243,10 @@ mod tests {
}
}
fn test_ctx() -> RequestContext {
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
macro_rules! write {
($writer:ident, $data:literal) => {{
$writer
.write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx())
.write_buffered(::bytes::Bytes::from_static($data).slice_full())
.await?;
}};
}
@@ -277,7 +260,7 @@ mod tests {
write!(writer, b"c");
write!(writer, b"d");
write!(writer, b"e");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
@@ -293,7 +276,7 @@ mod tests {
write!(writer, b"de");
write!(writer, b"");
write!(writer, b"fghijk");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
@@ -309,7 +292,7 @@ mod tests {
write!(writer, b"bc");
write!(writer, b"d");
write!(writer, b"e");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
@@ -319,20 +302,18 @@ mod tests {
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
let ctx = test_ctx();
let ctx = &ctx;
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
writer.write_buffered_borrowed(b"abc", ctx).await?;
writer.write_buffered_borrowed(b"d", ctx).await?;
writer.write_buffered_borrowed(b"e", ctx).await?;
writer.write_buffered_borrowed(b"fg", ctx).await?;
writer.write_buffered_borrowed(b"hi", ctx).await?;
writer.write_buffered_borrowed(b"j", ctx).await?;
writer.write_buffered_borrowed(b"klmno", ctx).await?;
writer.write_buffered_borrowed(b"abc").await?;
writer.write_buffered_borrowed(b"d").await?;
writer.write_buffered_borrowed(b"e").await?;
writer.write_buffered_borrowed(b"fg").await?;
writer.write_buffered_borrowed(b"hi").await?;
writer.write_buffered_borrowed(b"j").await?;
writer.write_buffered_borrowed(b"klmno").await?;
let recorder = writer.flush_and_into_inner(ctx).await?;
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
{

View File

@@ -14,8 +14,7 @@ OBJS = \
relsize_cache.o \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
control_plane_connector.o
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

View File

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 2;
int neon_protocol_version = 1;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,

View File

@@ -34,7 +34,6 @@
#include "walproposer.h"
#include "pagestore_client.h"
#include "control_plane_connector.h"
#include "walsender_hooks.h"
PG_MODULE_MAGIC;
void _PG_init(void);
@@ -266,6 +265,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
}
}
void
_PG_init(void)
{
@@ -279,7 +279,6 @@ _PG_init(void)
pg_init_libpagestore();
pg_init_walproposer();
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitLogicalReplicationMonitor();

View File

@@ -36,7 +36,10 @@
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
static void NeonWALReaderResetRemote(NeonWALReader *state);
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
static void neon_wal_segment_close(NeonWALReader *state);
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
TimeLineID tli);
@@ -79,9 +82,8 @@ struct NeonWALReader
XLogRecPtr req_lsn;
Size req_len;
Size req_progress;
char donor_conninfo[MAXCONNINFO];
WalProposer *wp; /* we learn donor through walproposer */
char donor_name[64]; /* saved donor safekeeper name for logging */
XLogRecPtr donor_lsn;
/* state of connection to safekeeper */
NeonWALReaderRemoteState rem_state;
WalProposerConn *wp_conn;
@@ -105,7 +107,7 @@ struct NeonWALReader
/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix)
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
{
NeonWALReader *reader;
@@ -121,6 +123,8 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
reader->seg.ws_tli = 0;
reader->segcxt.ws_segsize = wal_segment_size;
reader->wp = wp;
reader->rem_state = RS_NONE;
if (log_prefix)
@@ -200,16 +204,21 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
{
if (state->rem_state == RS_NONE)
{
if (!NeonWALReaderUpdateDonor(state))
XLogRecPtr donor_lsn;
/* no connection yet; start one */
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
if (donor == NULL)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to establish remote connection to fetch WAL: no donor available");
return NEON_WALREAD_ERROR;
}
/* no connection yet; start one */
nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn));
state->wp_conn = libpqwp_connect_start(state->donor_conninfo);
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
state->wp_conn = libpqwp_connect_start(donor->conninfo);
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
{
snprintf(state->err_msg, sizeof(state->err_msg),
@@ -242,22 +251,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
{
/* connection successfully established */
char start_repl_query[128];
term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm);
/*
* Set elected walproposer's term to pull only data from
* its history. Note: for logical walsender it means we
* might stream WAL not yet committed by safekeepers. It
* would be cleaner to fix this.
*
* mineLastElectedTerm shouldn't be 0 at this point
* because we checked above that donor exists and it
* appears only after successful election.
*/
Assert(term > 0);
snprintf(start_repl_query, sizeof(start_repl_query),
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
LSN_FORMAT_ARGS(startptr), term);
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
state->donor_name, start_repl_query);
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
@@ -407,10 +404,6 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;
/* Update the current segment info. */
state->seg.ws_tli = tli;
return NEON_WALREAD_SUCCESS;
}
}
@@ -533,7 +526,7 @@ err:
}
/* reset remote connection and request in progress */
void
static void
NeonWALReaderResetRemote(NeonWALReader *state)
{
state->req_lsn = InvalidXLogRecPtr;
@@ -698,25 +691,13 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
return true;
}
XLogRecPtr
NeonWALReaderGetRemLsn(NeonWALReader *state)
{
return state->rem_lsn;
}
const WALOpenSegment *
NeonWALReaderGetSegment(NeonWALReader *state)
{
return &state->seg;
}
/*
* Copy of vanilla wal_segment_open, but returns false in case of error instead
* of ERROR, with errno set.
*
* XLogReaderRoutine->segment_open callback for local pg_wal files
*/
bool
static bool
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
TimeLineID *tli_p)
{
@@ -743,7 +724,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
}
/* copy of vanilla wal_segment_close with NeonWALReader */
void
static void
neon_wal_segment_close(NeonWALReader *state)
{
if (state->seg.ws_file >= 0)
@@ -759,19 +740,3 @@ NeonWALReaderErrMsg(NeonWALReader *state)
{
return state->err_msg;
}
/*
* Returns true if there is a donor, and false otherwise
*/
bool
NeonWALReaderUpdateDonor(NeonWALReader *state)
{
WalproposerShmemState *wps = GetWalpropShmemState();
SpinLockAcquire(&wps->mutex);
memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name));
memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo));
state->donor_lsn = wps->donor_lsn;
SpinLockRelease(&wps->mutex);
return state->donor_name[0] != '\0';
}
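
Editor's note: `NeonWALReaderUpdateDonor` just snapshots the shared donor fields under the spinlock and reports whether a donor is known. The same idea restated in Rust for consistency with the earlier examples (`Mutex` stands in for the spinlock; the struct is a hypothetical model of the shmem fields):

```rust
use std::sync::Mutex;

#[derive(Clone, Default)]
struct DonorShmem {
    name: String,     // donor_name
    conninfo: String, // donor_conninfo
    lsn: u64,         // donor_lsn
}

// Copy the shared fields out under a short critical section, then test
// whether a donor has been elected (an empty name means "none yet").
fn update_donor(shared: &Mutex<DonorShmem>) -> Option<DonorShmem> {
    let snapshot = shared.lock().unwrap().clone();
    if snapshot.name.is_empty() {
        None
    } else {
        Some(snapshot)
    }
}
```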

View File

@@ -19,19 +19,12 @@ typedef enum
NEON_WALREAD_ERROR,
} NeonWALReadResult;
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix);
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
extern void NeonWALReaderFree(NeonWALReader *state);
extern void NeonWALReaderResetRemote(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state);
extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state);
extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
extern void neon_wal_segment_close(NeonWALReader *state);
extern bool NeonWALReaderUpdateDonor(NeonWALReader *state);
#endif /* __NEON_WALREADER_H__ */

View File

@@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b);
static char *FormatSafekeeperState(Safekeeper *sk);
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);
WalProposer *
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -922,8 +922,7 @@ static void
DetermineEpochStartLsn(WalProposer *wp)
{
TermHistory *dth;
int n_ready = 0;
WalproposerShmemState *walprop_shared;
int n_ready = 0;
wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
@@ -965,18 +964,16 @@ DetermineEpochStartLsn(WalProposer *wp)
if (n_ready < wp->quorum)
{
/*
* This is a rare case that can be triggered if safekeeper has voted
* and disconnected. In this case, its state will not be SS_IDLE and
* its vote cannot be used, because we clean up `voteResponse` in
* `ShutdownConnection`.
* This is a rare case that can be triggered if safekeeper has voted and disconnected.
* In this case, its state will not be SS_IDLE and its vote cannot be used, because
* we clean up `voteResponse` in `ShutdownConnection`.
*/
wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready);
}
/*
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are
* bootstrapping and nothing was committed yet. Start streaming then from
* the basebackup LSN.
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
* and nothing was committed yet. Start streaming then from the basebackup LSN.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
@@ -987,12 +984,11 @@ DetermineEpochStartLsn(WalProposer *wp)
}
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
}
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
/*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
* it should never be zero at this point, if we know timelineStartLsn.
*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so it
* should never be zero at this point, if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
@@ -1026,9 +1022,10 @@ DetermineEpochStartLsn(WalProposer *wp)
* since which we are going to write according to the consensus. If not,
* we must bail out, as clog and other non rel data is inconsistent.
*/
walprop_shared = wp->api.get_shmem_state(wp);
if (!wp->config->syncSafekeepers)
{
WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp);
/*
* Basebackup LSN always points to the beginning of the record (not
* the page), as StartupXLOG most probably wants it this way.
@@ -1043,7 +1040,7 @@ DetermineEpochStartLsn(WalProposer *wp)
* compute (who could generate WAL) is ok.
*/
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
walprop_shared->mineLastElectedTerm)))
{
/*
* Panic to restart PG as we need to retake basebackup.
@@ -1057,8 +1054,8 @@ DetermineEpochStartLsn(WalProposer *wp)
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
}
}
walprop_shared->mineLastElectedTerm = wp->propTerm;
}
pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm);
}
/*
@@ -1108,13 +1105,9 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/*
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
* is created manually (test_s3_wal_replay)
*/
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
}
else
@@ -1184,12 +1177,6 @@ StartStreaming(Safekeeper *sk)
sk->active_state = SS_ACTIVE_SEND;
sk->streamingAt = sk->startStreamingAt;
/*
* Donors can only be in SS_ACTIVE state, so we potentially update the
* donor when we switch one to SS_ACTIVE.
*/
UpdateDonorShmem(sk->wp);
/* event set will be updated inside SendMessageToNode */
SendMessageToNode(sk);
}
@@ -1581,17 +1568,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
* none if it doesn't exist. donor_lsn is set to end position of the donor to
* the best of our knowledge.
*/
static void
UpdateDonorShmem(WalProposer *wp)
Safekeeper *
GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
{
Safekeeper *donor = NULL;
int i;
XLogRecPtr donor_lsn = InvalidXLogRecPtr;
*donor_lsn = InvalidXLogRecPtr;
if (wp->n_votes < wp->quorum)
{
wp_log(WARNING, "UpdateDonorShmem called before elections are won");
return;
wp_log(WARNING, "GetDonor called before elections are won");
return NULL;
}
/*
@@ -1602,7 +1589,7 @@ UpdateDonorShmem(WalProposer *wp)
if (wp->safekeeper[wp->donor].state >= SS_IDLE)
{
donor = &wp->safekeeper[wp->donor];
donor_lsn = wp->propEpochStartLsn;
*donor_lsn = wp->propEpochStartLsn;
}
/*
@@ -1614,19 +1601,13 @@ UpdateDonorShmem(WalProposer *wp)
{
Safekeeper *sk = &wp->safekeeper[i];
if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn)
if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn)
{
donor = sk;
donor_lsn = sk->appendResponse.flushLsn;
*donor_lsn = sk->appendResponse.flushLsn;
}
}
if (donor == NULL)
{
wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping");
return;
}
wp->api.update_donor(wp, donor, donor_lsn);
return donor;
}
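
Editor's note: `GetDonor`'s selection rule, restated as a Rust sketch (one language is used for all examples here): start from the elected donor at `propEpochStartLsn` if its state still qualifies, then let any active safekeeper with a higher flushed LSN win.

```rust
// (index, lsn) pairs; `elected` is Some only if the original donor's
// state is at least SS_IDLE. Returns the best donor, or None if nobody
// qualifies. Edge handling of invalid (0) LSNs is simplified.
fn pick_donor(
    elected: Option<(usize, u64)>,
    active_flush_lsns: &[(usize, u64)],
) -> Option<(usize, u64)> {
    let mut best = elected;
    for &(idx, flush_lsn) in active_flush_lsns {
        if best.map_or(true, |(_, lsn)| flush_lsn > lsn) {
            best = Some((idx, flush_lsn));
        }
    }
    best
}
```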
/*
@@ -1636,7 +1617,7 @@ static void
HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
{
XLogRecPtr candidateTruncateLsn;
XLogRecPtr newCommitLsn;
XLogRecPtr newCommitLsn;
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
if (newCommitLsn > wp->commitLsn)
@@ -1646,7 +1627,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
BroadcastAppendRequest(wp);
}
/*
/*
* Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown().
* The last one will terminate the process if the shutdown is requested
* and WAL is committed by the quorum. BroadcastAppendRequest() should be

View File

@@ -284,19 +284,14 @@ typedef struct PageserverFeedback
typedef struct WalproposerShmemState
{
pg_atomic_uint64 propEpochStartLsn;
char donor_name[64];
char donor_conninfo[MAXCONNINFO];
XLogRecPtr donor_lsn;
slock_t mutex;
pg_atomic_uint64 mineLastElectedTerm;
term_t mineLastElectedTerm;
pg_atomic_uint64 backpressureThrottlingTime;
pg_atomic_uint64 currentClusterSize;
/* last feedback from each shard */
PageserverFeedback shard_ps_feedback[MAX_SHARDS];
int num_shards;
int num_shards;
/* aggregated feedback with min LSNs across shards */
PageserverFeedback min_ps_feedback;
@@ -470,9 +465,6 @@ typedef struct walproposer_api
/* Get pointer to the latest available WAL. */
XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp);
/* Update current donor info in WalProposer Shmem */
void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn);
/* Get current time. */
TimestampTz (*get_current_timestamp) (WalProposer *wp);
@@ -505,7 +497,7 @@ typedef struct walproposer_api
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*
*
* Returns PG_ASYNC_READ_FAIL on closed connection.
*/
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
@@ -553,14 +545,13 @@ typedef struct walproposer_api
* Returns 0 if timeout is reached, 1 if some event happened. Updates
* events mask to indicate events and sets sk to the safekeeper which has
* an event.
*
*
* On timeout, events is set to WL_NO_EVENTS. On socket event, events is
* set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is
* closed, events is set to WL_SOCKET_READABLE.
*
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the
* buffer. It can be returned only if caller asked for this event in the
* last *_event_set call.
*
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer.
* It can be returned only if caller asked for this event in the last *_event_set call.
*/
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
@@ -580,9 +571,9 @@ typedef struct walproposer_api
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
/*
* Called after every AppendResponse from the safekeeper. Used to
* propagate backpressure feedback and to confirm WAL persistence (has
* been committed on the quorum of safekeepers).
* Called after every AppendResponse from the safekeeper. Used to propagate
* backpressure feedback and to confirm WAL persistence (has been committed
* on the quorum of safekeepers).
*/
void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk);
@@ -725,14 +716,12 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
extern void WalProposerPoll(WalProposer *wp);
extern void WalProposerFree(WalProposer *wp);
extern WalproposerShmemState *GetWalpropShmemState();
/*
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
* recreate set from scratch, hence the export.
*/
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
extern TimeLineID walprop_pg_get_timeline_id(void);
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
#define WPEVENT 1337 /* special log level for walproposer internal

View File

@@ -85,6 +85,7 @@ static void walprop_pg_init_standalone_sync_safekeepers(void);
static void walprop_pg_init_walsender(void);
static void walprop_pg_init_bgworker(void);
static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
static TimeLineID walprop_pg_get_timeline_id(void);
static void walprop_pg_load_libpqwalreceiver(void);
static process_interrupts_callback_t PrevProcessInterruptsCallback;
@@ -93,8 +94,6 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type;
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static void WalproposerShmemInit_SyncSafekeeper(void);
static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
static void WalSndLoop(WalProposer *wp);
@@ -137,7 +136,6 @@ WalProposerSync(int argc, char *argv[])
WalProposer *wp;
init_walprop_config(true);
WalproposerShmemInit_SyncSafekeeper();
walprop_pg_init_standalone_sync_safekeepers();
walprop_pg_load_libpqwalreceiver();
@@ -283,8 +281,6 @@ WalproposerShmemInit(void)
{
memset(walprop_shared, 0, WalproposerShmemSize());
SpinLockInit(&walprop_shared->mutex);
pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
}
@@ -293,17 +289,6 @@ WalproposerShmemInit(void)
return found;
}
static void
WalproposerShmemInit_SyncSafekeeper(void)
{
walprop_shared = palloc(WalproposerShmemSize());
memset(walprop_shared, 0, WalproposerShmemSize());
SpinLockInit(&walprop_shared->mutex);
pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
}
#define BACK_PRESSURE_DELAY 10000L // 0.01 sec
static bool
@@ -414,13 +399,6 @@ nwp_shmem_startup_hook(void)
WalproposerShmemInit();
}
WalproposerShmemState *
GetWalpropShmemState()
{
Assert(walprop_shared != NULL);
return walprop_shared;
}
static WalproposerShmemState *
walprop_pg_get_shmem_state(WalProposer *wp)
{
@@ -453,15 +431,14 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback)
for (int i = 0; i < walprop_shared->num_shards; i++)
{
PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];
if (feedback->present)
{
if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
min_feedback.last_received_lsn = feedback->last_received_lsn;
if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;
if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
}
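
The loop above reduces per-shard feedback to a single conservative value: for each LSN field it takes the minimum over all shards that have reported, treating InvalidXLogRecPtr (0) as "no value yet". A minimal sketch of that reduction, in Rust with illustrative names (the `present` flag handling of the real code is omitted):

const INVALID_LSN: u64 = 0;

#[derive(Clone, Copy, Default)]
struct Feedback {
    present: bool,
    last_received_lsn: u64,
    disk_consistent_lsn: u64,
    remote_consistent_lsn: u64,
}

// Field-wise minimum over all present shards; 0 means "unset so far".
fn min_feedback(shards: &[Feedback]) -> Feedback {
    let mut min = Feedback::default();
    for f in shards.iter().filter(|f| f.present) {
        for (dst, src) in [
            (&mut min.last_received_lsn, f.last_received_lsn),
            (&mut min.disk_consistent_lsn, f.disk_consistent_lsn),
            (&mut min.remote_consistent_lsn, f.remote_consistent_lsn),
        ] {
            if *dst == INVALID_LSN || src < *dst {
                *dst = src;
            }
        }
    }
    min
}
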
@@ -574,7 +551,6 @@ static void
walprop_sigusr2(SIGNAL_ARGS)
{
int save_errno = errno;
got_SIGUSR2 = true;
SetLatch(MyLatch);
errno = save_errno;
@@ -622,7 +598,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp)
return GetCurrentTimestamp();
}
TimeLineID
static TimeLineID
walprop_pg_get_timeline_id(void)
{
#if PG_VERSION_NUM >= 150000
@@ -641,20 +617,6 @@ walprop_pg_load_libpqwalreceiver(void)
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
}
static void
walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn)
{
WalproposerShmemState *wps = wp->api.get_shmem_state(wp);
char donor_name[64];
pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port);
SpinLockAcquire(&wps->mutex);
memcpy(wps->donor_name, donor_name, sizeof(donor_name));
memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo));
wps->donor_lsn = donor_lsn;
SpinLockRelease(&wps->mutex);
}
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -755,6 +717,7 @@ walprop_connect_start(Safekeeper *sk)
{
Assert(sk->conn == NULL);
sk->conn = libpqwp_connect_start(sk->conninfo);
}
static WalProposerConnectPollStatusType
@@ -1128,7 +1091,7 @@ static void
StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
{
XLogRecPtr FlushPtr;
__attribute__((unused)) TimeLineID currTLI;
__attribute__((unused)) TimeLineID currTLI;
#if PG_VERSION_NUM < 150000
if (ThisTimeLineID == 0)
@@ -1332,13 +1295,116 @@ XLogBroadcastWalProposer(WalProposer *wp)
}
}
/*
 * Used to download WAL before basebackup for logical walsenders from sk; no
 * longer needed because walsender always uses neon_walreader.
 */
/* Download WAL before basebackup for logical walsenders from sk, if needed */
static bool
WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
{
char *err;
WalReceiverConn *wrconn;
WalRcvStreamOptions options;
char conninfo[MAXCONNINFO];
TimeLineID timeline;
XLogRecPtr startpos;
XLogRecPtr endpos;
startpos = GetLogRepRestartLSN(wp);
if (startpos == InvalidXLogRecPtr)
return true; /* recovery not needed */
endpos = wp->propEpochStartLsn;
timeline = wp->greetRequest.timeline;
if (!neon_auth_token)
{
memcpy(conninfo, sk->conninfo, MAXCONNINFO);
}
else
{
int written = 0;
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
if (written >= MAXCONNINFO || written < 0)
wpg_log(FATAL, "could not append password to the safekeeper connection string");
}
#if PG_MAJORVERSION_NUM < 16
wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
#else
wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err);
#endif
if (!wrconn)
{
ereport(WARNING,
(errmsg("could not connect to WAL acceptor %s:%s: %s",
sk->host, sk->port,
err)));
return false;
}
wpg_log(LOG,
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
sk->host, sk->port, (uint32) (startpos >> 32),
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
options.logical = false;
options.startpoint = startpos;
options.slotname = NULL;
options.proto.physical.startpointTLI = timeline;
if (walrcv_startstreaming(wrconn, &options))
{
XLogRecPtr rec_start_lsn;
XLogRecPtr rec_end_lsn = 0;
int len;
char *buf;
pgsocket wait_fd = PGINVALID_SOCKET;
while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0)
{
if (len == 0)
{
(void) WaitLatchOrSocket(
MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd,
-1, WAIT_EVENT_WAL_RECEIVER_MAIN);
}
else
{
Assert(buf[0] == 'w' || buf[0] == 'k');
if (buf[0] == 'k')
continue; /* keepalive */
memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS],
sizeof rec_start_lsn);
rec_start_lsn = pg_ntoh64(rec_start_lsn);
rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
/* write WAL to disk */
XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
ereport(DEBUG1,
(errmsg("Recover message %X/%X length %d",
LSN_FORMAT_ARGS(rec_start_lsn), len)));
if (rec_end_lsn >= endpos)
break;
}
}
ereport(LOG,
(errmsg("end of replication stream at %X/%X: %m",
LSN_FORMAT_ARGS(rec_end_lsn))));
walrcv_disconnect(wrconn);
/* failed to receive all WAL till endpos */
if (rec_end_lsn < endpos)
return false;
}
else
{
ereport(LOG,
(errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X",
timeline, (uint32) (startpos >> 32), (uint32) startpos)));
return false;
}
return true;
}
@@ -1479,7 +1545,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
if (sk->xlogreader == NULL)
wpg_log(FATAL, "failed to allocate xlog reader");
}
@@ -1894,8 +1960,8 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
static void
walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
{
HotStandbyFeedback hsFeedback;
bool needToAdvanceSlot = false;
HotStandbyFeedback hsFeedback;
bool needToAdvanceSlot = false;
if (wp->config->syncSafekeepers)
return;
@@ -2029,25 +2095,22 @@ GetLogRepRestartLSN(WalProposer *wp)
return lrRestartLsn;
}
void
SetNeonCurrentClusterSize(uint64 size)
void SetNeonCurrentClusterSize(uint64 size)
{
pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
}
uint64
GetNeonCurrentClusterSize(void)
uint64 GetNeonCurrentClusterSize(void)
{
return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
}
uint64 GetNeonCurrentClusterSize(void);
uint64 GetNeonCurrentClusterSize(void);
static const walproposer_api walprop_pg = {
.get_shmem_state = walprop_pg_get_shmem_state,
.start_streaming = walprop_pg_start_streaming,
.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
.update_donor = walprop_pg_update_donor,
.get_current_timestamp = walprop_pg_get_current_timestamp,
.conn_error_message = walprop_error_message,
.conn_status = walprop_status,


@@ -1,172 +0,0 @@
/*-------------------------------------------------------------------------
*
* walsender_hooks.c
*
* Implements XLogReaderRoutine in terms of NeonWALReader. Allows for
* fetching WAL from safekeepers, which normal xlogreader can't do.
*
*-------------------------------------------------------------------------
*/
#include "walsender_hooks.h"
#include "postgres.h"
#include "fmgr.h"
#include "access/xlogdefs.h"
#include "replication/walsender.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogreader.h"
#include "miscadmin.h"
#include "utils/wait_event.h"
#include "utils/guc.h"
#include "postmaster/interrupt.h"
#include "neon_walreader.h"
#include "walproposer.h"
static NeonWALReader *wal_reader = NULL;
extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
static XLogRecPtr
NeonWALReadWaitForWAL(XLogRecPtr loc)
{
while (!NeonWALReaderUpdateDonor(wal_reader))
{
pg_usleep(1000);
CHECK_FOR_INTERRUPTS();
}
return WalSndWaitForWal(loc);
}
static int
NeonWALPageRead(
XLogReaderState *xlogreader,
XLogRecPtr targetPagePtr,
int reqLen,
XLogRecPtr targetRecPtr,
char *readBuf)
{
XLogRecPtr rem_lsn;
/* Wait for flush pointer to advance past our request */
XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen);
int count;
if (flushptr < targetPagePtr + reqLen)
return -1;
/* Read at most XLOG_BLCKSZ bytes */
if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
count = XLOG_BLCKSZ;
else
count = flushptr - targetPagePtr;
/*
* Sometimes walsender requests non-monotonic sequences of WAL. If that's
* the case, we have to reset streaming from remote at the correct
* position. For example, walsender may try to verify the segment header
* when trying to read in the middle of it.
*/
rem_lsn = NeonWALReaderGetRemLsn(wal_reader);
if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn)
{
NeonWALReaderResetRemote(wal_reader);
}
for (;;)
{
NeonWALReadResult res = NeonWALRead(
wal_reader,
readBuf,
targetPagePtr,
count,
walprop_pg_get_timeline_id());
if (res == NEON_WALREAD_SUCCESS)
{
/*
* Setting ws_tli is required by the XLogReaderRoutine, it is used
* for segment name generation in error reports.
*
* ReadPageInternal updates ws_segno on its own after calling the
* callback, and the XLogReaderRoutine description doesn't require it,
* but WALRead sets it, so we follow suit.
*/
xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli;
xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno;
/*
* ws_file doesn't exist in case of remote read, and isn't used by
* xlogreader except by WALRead on which we don't rely anyway.
*/
return count;
}
if (res == NEON_WALREAD_ERROR)
{
elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s",
LSN_FORMAT_ARGS(targetPagePtr),
reqLen,
NeonWALReaderErrMsg(wal_reader));
return -1;
}
/*
* Res is WOULDBLOCK, so we wait on the socket, recreating event set
* if necessary
*/
{
pgsocket sock = NeonWALReaderSocket(wal_reader);
uint32_t reader_events = NeonWALReaderEvents(wal_reader);
long timeout_ms = 1000;
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
}
WaitLatchOrSocket(
MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events,
sock,
timeout_ms,
WAIT_EVENT_WAL_SENDER_MAIN);
}
}
}
static void
NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p)
{
neon_wal_segment_open(wal_reader, nextSegNo, tli_p);
xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file;
}
static void
NeonWALReadSegmentClose(XLogReaderState *xlogreader)
{
neon_wal_segment_close(wal_reader);
xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file;
}
void
NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
{
if (!wal_reader)
{
XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
if (epochStartLsn == 0)
{
elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!");
}
wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] ");
}
xlr->page_read = NeonWALPageRead;
xlr->segment_open = NeonWALReadSegmentOpen;
xlr->segment_close = NeonWALReadSegmentClose;
}


@@ -1,7 +0,0 @@
#ifndef __WALSENDER_HOOKS_H__
#define __WALSENDER_HOOKS_H__
struct XLogReaderRoutine;
void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr);
#endif

poetry.lock generated

@@ -1001,17 +1001,18 @@ dotenv = ["python-dotenv"]
[[package]]
name = "flask-cors"
version = "4.0.1"
version = "3.0.10"
description = "A Flask extension adding a decorator for CORS support"
optional = false
python-versions = "*"
files = [
{file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"},
{file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"},
{file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"},
{file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"},
]
[package.dependencies]
Flask = ">=0.9"
Six = "*"
[[package]]
name = "frozenlist"
@@ -1242,13 +1243,13 @@ files = [
[[package]]
name = "jinja2"
version = "3.1.4"
version = "3.1.3"
description = "A very fast and expressive template engine."
optional = false
python-versions = ">=3.7"
files = [
{file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
{file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
{file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
{file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
]
[package.dependencies]
@@ -2611,13 +2612,13 @@ files = [
[[package]]
name = "werkzeug"
version = "3.0.3"
version = "3.0.1"
description = "The comprehensive WSGI web application library."
optional = false
python-versions = ">=3.8"
files = [
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
{file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
{file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
]
[package.dependencies]
@@ -2899,4 +2900,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb"
content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572"


@@ -40,7 +40,6 @@ hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
indexmap.workspace = true
ipnet.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }
@@ -60,8 +59,8 @@ prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
reqwest.workspace = true
reqwest-middleware = { workspace = true, features = ["json"] }
reqwest = { workspace = true, features = ["json"] }
reqwest-middleware.workspace = true
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
routerify.workspace = true
@@ -85,7 +84,6 @@ tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tower-service.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true


@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache;
use proxy::redis::notifications;
use proxy::serverless::cancel_set::CancelSet;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
@@ -119,11 +118,8 @@ struct ProxyCliArgs {
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
wake_compute_cache: String,
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
#[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
wake_compute_lock: String,
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
connect_compute_lock: String,
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
@@ -244,12 +240,6 @@ struct SqlOverHttpArgs {
/// increase memory used by the pool
#[clap(long, default_value_t = 128)]
sql_over_http_pool_shards: usize,
#[clap(long, default_value_t = 10000)]
sql_over_http_client_conn_threshold: u64,
#[clap(long, default_value_t = 64)]
sql_over_http_cancel_set_shards: usize,
}
#[tokio::main]
@@ -539,21 +529,24 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
endpoint_cache_config,
)));
let config::ConcurrencyLockOptions {
let config::WakeComputeLockOptions {
shards,
permits,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(console::locks::ApiLocks::new(
"wake_compute_lock",
permits,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)?));
let locks = Box::leak(Box::new(
console::locks::ApiLocks::new(
"wake_compute_lock",
permits,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)
.unwrap(),
));
tokio::spawn(locks.garbage_collect_worker());
let url = args.auth_endpoint.parse()?;
@@ -579,23 +572,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
auth::BackendType::Link(MaybeOwned::Owned(url), ())
}
};
let config::ConcurrencyLockOptions {
shards,
permits,
epoch,
timeout,
} = args.connect_compute_lock.parse()?;
info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)");
let connect_compute_locks = console::locks::ApiLocks::new(
"connect_compute_lock",
permits,
shards,
timeout,
epoch,
&Metrics::get().proxy.connect_compute_lock,
)?;
let http_config = HttpConfig {
request_timeout: args.sql_over_http.sql_over_http_timeout,
pool_options: GlobalConnPoolOptions {
@@ -606,8 +582,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
},
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
};
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,
@@ -633,14 +607,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
region: args.region.clone(),
aws_region: args.aws_region.clone(),
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
connect_compute_locks,
connect_to_compute_retry_config: config::RetryConfig::parse(
&args.connect_to_compute_retry,
)?,
}));
tokio::spawn(config.connect_compute_locks.garbage_collect_worker());
Ok(config)
}


@@ -21,7 +21,7 @@ use crate::{
config::EndpointCacheConfig,
context::RequestMonitoring,
intern::{BranchIdInt, EndpointIdInt, ProjectIdInt},
metrics::{Metrics, RedisErrors, RedisEventsCount},
metrics::{Metrics, RedisErrors},
rate_limiter::GlobalRateLimiter,
redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider,
EndpointId,
@@ -100,26 +100,14 @@ impl EndpointsCache {
if let Some(endpoint_created) = key.endpoint_created {
self.endpoints
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::EndpointCreated);
}
if let Some(branch_created) = key.branch_created {
self.branches
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::BranchCreated);
}
if let Some(project_created) = key.project_created {
self.projects
.insert(ProjectIdInt::from(&project_created.project_id.into()));
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::ProjectCreated);
}
}
pub async fn do_read(


@@ -5,11 +5,9 @@ use std::{
time::Duration,
};
use async_trait::async_trait;
use dashmap::DashMap;
use rand::{thread_rng, Rng};
use smol_str::SmolStr;
use tokio::sync::Mutex;
use tokio::time::Instant;
use tracing::{debug, info};
@@ -23,12 +21,11 @@ use crate::{
use super::{Cache, Cached};
#[async_trait]
pub trait ProjectInfoCache {
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
async fn decrement_active_listeners(&self);
async fn increment_active_listeners(&self);
fn enable_ttl(&self);
fn disable_ttl(&self);
}
struct Entry<T> {
@@ -119,10 +116,8 @@ pub struct ProjectInfoCacheImpl {
start_time: Instant,
ttl_disabled_since_us: AtomicU64,
active_listeners_lock: Mutex<usize>,
}
#[async_trait]
impl ProjectInfoCache for ProjectInfoCacheImpl {
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
info!("invalidating allowed ips for project `{}`", project_id);
@@ -153,27 +148,15 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
}
}
}
async fn decrement_active_listeners(&self) {
let mut listeners_guard = self.active_listeners_lock.lock().await;
if *listeners_guard == 0 {
tracing::error!("active_listeners count is already 0, something is broken");
return;
}
*listeners_guard -= 1;
if *listeners_guard == 0 {
self.ttl_disabled_since_us
.store(u64::MAX, std::sync::atomic::Ordering::SeqCst);
}
fn enable_ttl(&self) {
self.ttl_disabled_since_us
.store(u64::MAX, std::sync::atomic::Ordering::Relaxed);
}
async fn increment_active_listeners(&self) {
let mut listeners_guard = self.active_listeners_lock.lock().await;
*listeners_guard += 1;
if *listeners_guard == 1 {
let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
self.ttl_disabled_since_us
.store(new_ttl, std::sync::atomic::Ordering::SeqCst);
}
fn disable_ttl(&self) {
let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
self.ttl_disabled_since_us
.store(new_ttl, std::sync::atomic::Ordering::Relaxed);
}
}
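
The diff replaces the active-listener counter with a single atomic timestamp: disable_ttl() stores the instant (plus one TTL) from which entries stop expiring, and enable_ttl() resets it to u64::MAX so the TTL always applies. A hedged sketch of that switch with illustrative types; the expiry predicate is one plausible reading of the diff, not the proxy's exact code:

use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

struct TtlSwitch {
    start_time: Instant,
    ttl: Duration,
    // Micros since start_time from which TTL no longer applies;
    // u64::MAX means the TTL always applies.
    ttl_disabled_since_us: AtomicU64,
}

impl TtlSwitch {
    fn disable_ttl(&self) {
        let since = (self.start_time.elapsed() + self.ttl).as_micros() as u64;
        self.ttl_disabled_since_us.store(since, Ordering::Relaxed);
    }

    fn enable_ttl(&self) {
        self.ttl_disabled_since_us.store(u64::MAX, Ordering::Relaxed);
    }

    // Assumption: an entry expires only if it was created before the
    // disable point and its TTL has elapsed.
    fn is_expired(&self, created_us: u64, now_us: u64) -> bool {
        created_us < self.ttl_disabled_since_us.load(Ordering::Relaxed)
            && now_us - created_us > self.ttl.as_micros() as u64
    }
}
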
@@ -185,7 +168,6 @@ impl ProjectInfoCacheImpl {
config,
ttl_disabled_since_us: AtomicU64::new(u64::MAX),
start_time: Instant::now(),
active_listeners_lock: Mutex::new(0),
}
}
@@ -450,7 +432,7 @@ mod tests {
ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600),
}));
cache.clone().increment_active_listeners().await;
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_secs(2)).await;
let project_id: ProjectId = "project".into();
@@ -507,7 +489,7 @@ mod tests {
}
#[tokio::test]
async fn test_increment_active_listeners_invalidate_added_before() {
async fn test_disable_ttl_invalidate_added_before() {
tokio::time::pause();
let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
size: 2,
@@ -532,7 +514,7 @@ mod tests {
(&user1).into(),
secret1.clone(),
);
cache.clone().increment_active_listeners().await;
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_millis(100)).await;
cache.insert_role_secret(
(&project_id).into(),


@@ -6,7 +6,6 @@ use crate::{
error::{ReportableError, UserFacingError},
metrics::{Metrics, NumDbConnectionsGuard},
proxy::neon_option,
Host,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
@@ -102,16 +101,6 @@ impl ConnCfg {
}
}
pub fn get_host(&self) -> Result<Host, WakeComputeError> {
match self.0.get_hosts() {
[tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()),
// we should not have multiple address or unix addresses.
_ => Err(WakeComputeError::BadComputeAddress(
"invalid compute address".into(),
)),
}
}
/// Apply startup message params to the connection config.
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
// Only set `user` if it's not present in the config.


@@ -1,9 +1,7 @@
use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
rate_limiter::RateBucketInfo,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host,
serverless::GlobalConnPoolOptions,
};
use anyhow::{bail, ensure, Context, Ok};
use itertools::Itertools;
@@ -36,7 +34,6 @@ pub struct ProxyConfig {
pub handshake_timeout: Duration,
pub aws_region: String,
pub wake_compute_retry_config: RetryConfig,
pub connect_compute_locks: ApiLocks<Host>,
pub connect_to_compute_retry_config: RetryConfig,
}
@@ -56,8 +53,6 @@ pub struct TlsConfig {
pub struct HttpConfig {
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet,
pub client_conn_threshold: u64,
}
pub struct AuthenticationConfig {
@@ -538,9 +533,9 @@ pub struct RetryConfig {
impl RetryConfig {
/// Default options for RetryConfig.
/// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s.
/// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
"num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2";
"num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
/// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
/// Cplane has a timeout of 60s on each request; 8m7s in total.
pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
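
The totals quoted in these comments follow from summing the per-attempt waits, assuming retry i waits base * factor^i. A quick check of that arithmetic (plain Rust, not proxy code):

// Total sleep across n retries where retry i waits base * factor^i.
fn total_delay_ms(num_retries: u32, base_ms: f64, factor: f64) -> f64 {
    (0..num_retries).map(|i| base_ms * factor.powi(i as i32)).sum()
}

fn main() {
    // 5 retries, 200ms base, factor 2:    6200 ms, i.e. about 6s
    println!("{:.0}", total_delay_ms(5, 200.0, 2.0));
    // 8 retries, 100ms base, factor 1.6: ~6992 ms, i.e. about 7s
    println!("{:.0}", total_delay_ms(8, 100.0, 1.6));
}
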
@@ -578,7 +573,7 @@ impl RetryConfig {
}
/// Helper for cmdline cache options parsing.
pub struct ConcurrencyLockOptions {
pub struct WakeComputeLockOptions {
/// The number of shards the lock map should have
pub shards: usize,
/// The number of allowed concurrent requests for each endpoint
@@ -589,12 +584,9 @@ pub struct ConcurrencyLockOptions {
pub timeout: Duration,
}
impl ConcurrencyLockOptions {
impl WakeComputeLockOptions {
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
"shards=64,permits=10,epoch=10m,timeout=10ms";
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
@@ -644,7 +636,7 @@ impl ConcurrencyLockOptions {
}
}
impl FromStr for ConcurrencyLockOptions {
impl FromStr for WakeComputeLockOptions {
type Err = anyhow::Error;
fn from_str(options: &str) -> Result<Self, Self::Err> {
@@ -680,7 +672,7 @@ mod tests {
#[test]
fn test_parse_lock_options() -> anyhow::Result<()> {
let ConcurrencyLockOptions {
let WakeComputeLockOptions {
epoch,
permits,
shards,
@@ -691,7 +683,7 @@ mod tests {
assert_eq!(shards, 32);
assert_eq!(permits, 4);
let ConcurrencyLockOptions {
let WakeComputeLockOptions {
epoch,
permits,
shards,
@@ -702,7 +694,7 @@ mod tests {
assert_eq!(shards, 16);
assert_eq!(permits, 8);
let ConcurrencyLockOptions {
let WakeComputeLockOptions {
epoch,
permits,
shards,
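
The strings exercised by this test follow a small key=value,key=value grammar (e.g. "shards=32,permits=4,epoch=10m,timeout=1s"). A hedged sketch of just the pair-splitting step, separate from the real ConcurrencyLockOptions::from_str (duration fields like epoch and timeout would additionally need a humantime-style parser, which is an assumption here):

// Split "k=v,k=v" into pairs; field interpretation happens afterwards.
fn parse_pairs(s: &str) -> Result<Vec<(&str, &str)>, String> {
    s.split(',')
        .filter(|kv| !kv.is_empty())
        .map(|kv| kv.split_once('=').ok_or_else(|| format!("bad option: {kv}")))
        .collect()
}

fn main() {
    let opts = parse_pairs("shards=32,permits=4,epoch=10m,timeout=1s").unwrap();
    assert_eq!(opts, vec![
        ("shards", "32"),
        ("permits", "4"),
        ("epoch", "10m"),
        ("timeout", "1s"),
    ]);
}
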


@@ -17,7 +17,7 @@ use crate::{
scram, EndpointCacheKey,
};
use dashmap::DashMap;
use std::{hash::Hash, sync::Arc, time::Duration};
use std::{sync::Arc, time::Duration};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::Instant;
use tracing::info;
@@ -76,7 +76,7 @@ pub mod errors {
}
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
}
_ => REQUEST_FAILED.to_owned(),
},
@@ -447,16 +447,16 @@ impl ApiCaches {
}
/// Concurrency locks for [`console`](super) API methods.
pub struct ApiLocks<K> {
pub struct ApiLocks {
name: &'static str,
node_locks: DashMap<K, Arc<Semaphore>>,
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
}
impl<K: Hash + Eq + Clone> ApiLocks<K> {
impl ApiLocks {
pub fn new(
name: &'static str,
permits: usize,
@@ -475,7 +475,10 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
})
}
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, errors::WakeComputeError> {
pub async fn get_wake_compute_permit(
&self,
key: &EndpointCacheKey,
) -> Result<WakeComputePermit, errors::WakeComputeError> {
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}
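
Underneath, ApiLocks is a map from key to a lazily created semaphore, which is what the generic parameter K now abstracts over. A minimal sketch of that keyed-semaphore pattern with illustrative names (the real type's timeout, metrics, and garbage collection are omitted):

use std::{hash::Hash, sync::Arc};

use dashmap::DashMap;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

struct KeyedLocks<K: Hash + Eq + Clone> {
    permits: usize,
    locks: DashMap<K, Arc<Semaphore>>,
}

impl<K: Hash + Eq + Clone> KeyedLocks<K> {
    /// At most `permits` concurrent holders per key.
    async fn get_permit(&self, key: &K) -> OwnedSemaphorePermit {
        // Clone the Arc out of the map entry before awaiting, so the
        // DashMap shard lock is not held across the await point.
        let sem = self
            .locks
            .entry(key.clone())
            .or_insert_with(|| Arc::new(Semaphore::new(self.permits)))
            .clone();
        sem.acquire_owned().await.expect("semaphore is never closed")
    }
}

With permits = 0 the real code skips the semaphore entirely, which is what the `if self.permits == 0` early return above implements.
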


@@ -13,7 +13,7 @@ use crate::{
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::EndpointRateLimiter,
scram, EndpointCacheKey, Normalize,
scram, Normalize,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
@@ -25,7 +25,7 @@ use tracing::{error, info, info_span, warn, Instrument};
pub struct Api {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub locks: &'static ApiLocks<EndpointCacheKey>,
pub locks: &'static ApiLocks,
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
jwt: String,
}
@@ -35,7 +35,7 @@ impl Api {
pub fn new(
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
locks: &'static ApiLocks,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
@@ -289,7 +289,7 @@ impl super::Api for Api {
return Err(WakeComputeError::TooManyConnections);
}
let permit = self.locks.get_permit(&key).await?;
let permit = self.locks.get_wake_compute_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
// double check


@@ -4,7 +4,7 @@
pub mod health_server;
use std::{str::FromStr, sync::Arc, time::Duration};
use std::{sync::Arc, time::Duration};
use futures::FutureExt;
pub use reqwest::{Request, Response, StatusCode};
@@ -103,12 +103,12 @@ impl Endpoint {
}
}
use hyper_util::client::legacy::connect::dns::{
GaiResolver as HyperGaiResolver, Name as HyperName,
};
use reqwest::dns::{Addrs, Name, Resolve, Resolving};
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
use tower_service::Service;
use hyper::{
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
service::Service,
};
use reqwest::dns::{Addrs, Resolve, Resolving};
#[derive(Debug)]
pub struct GaiResolver(HyperGaiResolver);
@@ -121,12 +121,11 @@ impl Default for GaiResolver {
impl Resolve for GaiResolver {
fn resolve(&self, name: Name) -> Resolving {
let this = &mut self.0.clone();
let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid");
let start = Instant::now();
Box::pin(
Service::<HyperName>::call(this, hyper_name).map(move |result| {
Service::<Name>::call(this, name.clone()).map(move |result| {
let resolve_duration = start.elapsed();
trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete");
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
result
.map(|addrs| -> Addrs { Box::new(addrs) })
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })

Some files were not shown because too many files have changed in this diff.