mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-13 15:40:37 +00:00
Compare commits
69 Commits
nix-devenv
...
arpad/less
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e97e86eb43 | ||
|
|
c9370d48de | ||
|
|
e6da7e29ed | ||
|
|
0353a72a00 | ||
|
|
ce4d3da3ae | ||
|
|
5da3e2113a | ||
|
|
4deb8dc52e | ||
|
|
64f0613edf | ||
|
|
1e7cd6ac9f | ||
|
|
ef03b38e52 | ||
|
|
9b65946566 | ||
|
|
a3fe12b6d8 | ||
|
|
b5a6e68e68 | ||
|
|
ce0ddd749c | ||
|
|
426598cf76 | ||
|
|
8b4dd5dc27 | ||
|
|
ed9a114bde | ||
|
|
b7385bb016 | ||
|
|
37b1930b2f | ||
|
|
d76963691f | ||
|
|
60f570c70d | ||
|
|
3582a95c87 | ||
|
|
00423152c6 | ||
|
|
240efb82f9 | ||
|
|
5f099dc760 | ||
|
|
7a49e5d5c2 | ||
|
|
45ec8688ea | ||
|
|
4b55dad813 | ||
|
|
ab95942fc2 | ||
|
|
f656db09a4 | ||
|
|
69bf1bae7d | ||
|
|
25af32e834 | ||
|
|
cb4b4750ba | ||
|
|
d43d77389e | ||
|
|
5558457c84 | ||
|
|
26e6ff8ba6 | ||
|
|
50a45e67dc | ||
|
|
fcbe60f436 | ||
|
|
e018cac1f7 | ||
|
|
a74b60066c | ||
|
|
3a2f10712a | ||
|
|
4ac4b21598 | ||
|
|
9f792f9c0b | ||
|
|
7434674d86 | ||
|
|
ea37234ccc | ||
|
|
3da54e6d90 | ||
|
|
010f0a310a | ||
|
|
eb53345d48 | ||
|
|
45c625fb34 | ||
|
|
84b6b95783 | ||
|
|
577982b778 | ||
|
|
574645412b | ||
|
|
11945e64ec | ||
|
|
cddafc79e1 | ||
|
|
af7cca4949 | ||
|
|
89cae64e38 | ||
|
|
1f417af9fd | ||
|
|
6f714c308b | ||
|
|
2e6afaa642 | ||
|
|
8f0a0440ba | ||
|
|
987dc01ed7 | ||
|
|
719e4ad580 | ||
|
|
e61b2a08b3 | ||
|
|
cc89b46ae5 | ||
|
|
d5cbdd2e90 | ||
|
|
6ad9c3560e | ||
|
|
9dc3b09e57 | ||
|
|
fe762e35d8 | ||
|
|
0c4988a92c |
@@ -48,6 +48,10 @@ inputs:
|
||||
description: 'benchmark durations JSON'
|
||||
required: false
|
||||
default: '{}'
|
||||
session_timeout:
|
||||
description: 'Session timeout for the test suite'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
@@ -107,6 +111,7 @@ runs:
|
||||
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
|
||||
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
|
||||
PG_VERSION: ${{ inputs.pg_version }}
|
||||
SESSION_TIMEOUT: ${{ inputs.session_timeout }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# PLATFORM will be embedded in the perf test report
|
||||
@@ -168,6 +173,10 @@ runs:
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [ -n "${SESSION_TIMEOUT}" ]; then
|
||||
EXTRA_PARAMS="--session-timeout ${SESSION_TIMEOUT} ${EXTRA_PARAMS}"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
|
||||
9
.github/workflows/build_and_test.yml
vendored
9
.github/workflows/build_and_test.yml
vendored
@@ -461,7 +461,8 @@ jobs:
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
# Hard timeout to prevent hanging tests, we also have set softer pytest timeout (set via `session_timeout`) which is shorter
|
||||
timeout-minutes: 110
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
test_selection: regress
|
||||
@@ -471,6 +472,8 @@ jobs:
|
||||
real_s3_region: eu-central-1
|
||||
rerun_flaky: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
# Set pytest session timeout to 25 minutes
|
||||
session_timeout: '1500'
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
@@ -478,6 +481,7 @@ jobs:
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: true
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
@@ -557,6 +561,9 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: false
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -23,7 +23,3 @@ compaction-suite-results.*
|
||||
|
||||
# pgindent typedef lists
|
||||
*.list
|
||||
|
||||
# nix dev env
|
||||
.direnv
|
||||
.devenv
|
||||
|
||||
368
Cargo.lock
generated
368
Cargo.lock
generated
@@ -595,7 +595,7 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper-rustls",
|
||||
"hyper-rustls 0.24.0",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
@@ -684,7 +684,7 @@ dependencies = [
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"itoa",
|
||||
"matchit",
|
||||
"matchit 0.7.0",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -722,9 +722,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "azure_core"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
|
||||
checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.21.1",
|
||||
@@ -740,7 +740,7 @@ dependencies = [
|
||||
"pin-project",
|
||||
"quick-xml",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"reqwest 0.11.19",
|
||||
"rustc_version",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -752,9 +752,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "azure_identity"
|
||||
version = "0.18.1"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
|
||||
checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
|
||||
dependencies = [
|
||||
"async-lock",
|
||||
"async-trait",
|
||||
@@ -772,9 +772,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "azure_storage"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
|
||||
checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
|
||||
dependencies = [
|
||||
"RustyXML",
|
||||
"async-lock",
|
||||
@@ -791,9 +791,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "azure_storage_blobs"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
|
||||
checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
|
||||
dependencies = [
|
||||
"RustyXML",
|
||||
"azure_core",
|
||||
@@ -812,9 +812,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "azure_svc_blobstorage"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
|
||||
checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
|
||||
dependencies = [
|
||||
"azure_core",
|
||||
"bytes",
|
||||
@@ -865,6 +865,12 @@ version = "0.21.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "base64-simd"
|
||||
version = "0.8.0"
|
||||
@@ -1210,7 +1216,7 @@ dependencies = [
|
||||
"postgres",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"rust-ini",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -1319,6 +1325,7 @@ dependencies = [
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
@@ -1328,7 +1335,7 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"safekeeper_api",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
@@ -2362,6 +2369,17 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hostname"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"windows 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.9"
|
||||
@@ -2508,6 +2526,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
"want",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2525,6 +2544,23 @@ dependencies = [
|
||||
"tokio-rustls 0.24.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-rustls"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-timeout"
|
||||
version = "0.4.1"
|
||||
@@ -2572,6 +2608,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
@@ -2579,6 +2616,9 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"socket2 0.5.5",
|
||||
"tokio",
|
||||
"tower",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2592,7 +2632,7 @@ dependencies = [
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows",
|
||||
"windows 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2763,9 +2803,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.63"
|
||||
version = "0.3.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790"
|
||||
checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
|
||||
dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
@@ -2915,6 +2955,12 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"
|
||||
|
||||
[[package]]
|
||||
name = "md-5"
|
||||
version = "0.10.5"
|
||||
@@ -3048,16 +3094,6 @@ version = "0.3.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
@@ -3184,6 +3220,16 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.1"
|
||||
@@ -3391,7 +3437,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"http 0.2.9",
|
||||
"opentelemetry_api",
|
||||
"reqwest",
|
||||
"reqwest 0.11.19",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3409,7 +3455,7 @@ dependencies = [
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
"prost",
|
||||
"reqwest",
|
||||
"reqwest 0.11.19",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tonic",
|
||||
@@ -3520,6 +3566,12 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "p256"
|
||||
version = "0.11.1"
|
||||
@@ -3586,7 +3638,6 @@ dependencies = [
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -3632,7 +3683,7 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"rpds",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
@@ -3702,7 +3753,7 @@ dependencies = [
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
@@ -4055,7 +4106,6 @@ name = "postgres_backend"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"once_cell",
|
||||
@@ -4311,7 +4361,7 @@ dependencies = [
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hostname",
|
||||
"hostname 0.3.1",
|
||||
"http 1.1.0",
|
||||
"http-body-util",
|
||||
"humantime",
|
||||
@@ -4344,7 +4394,7 @@ dependencies = [
|
||||
"redis",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"reqwest-middleware",
|
||||
"reqwest-retry",
|
||||
"reqwest-tracing",
|
||||
@@ -4371,6 +4421,7 @@ dependencies = [
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -4661,6 +4712,7 @@ dependencies = [
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sync_wrapper",
|
||||
"test-context",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
@@ -4686,69 +4738,106 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"mime_guess",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.21.11",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"wasm-streams",
|
||||
"wasm-streams 0.3.0",
|
||||
"web-sys",
|
||||
"webpki-roots 0.25.2",
|
||||
"winreg",
|
||||
"winreg 0.50.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-rustls 0.26.0",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"wasm-streams 0.4.0",
|
||||
"web-sys",
|
||||
"webpki-roots 0.26.1",
|
||||
"winreg 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-middleware"
|
||||
version = "0.2.2"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d"
|
||||
checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"http 0.2.9",
|
||||
"reqwest",
|
||||
"http 1.1.0",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-retry"
|
||||
version = "0.2.2"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
|
||||
checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"chrono",
|
||||
"futures",
|
||||
"getrandom 0.2.11",
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"http 1.1.0",
|
||||
"hyper 1.2.0",
|
||||
"parking_lot 0.11.2",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"reqwest-middleware",
|
||||
"retry-policies",
|
||||
"task-local-extensions",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wasm-timer",
|
||||
@@ -4756,27 +4845,27 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.7"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3"
|
||||
checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"matchit",
|
||||
"http 1.1.0",
|
||||
"matchit 0.8.2",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"reqwest-middleware",
|
||||
"task-local-extensions",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "retry-policies"
|
||||
version = "0.1.2"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
|
||||
checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@@ -5095,18 +5184,23 @@ dependencies = [
|
||||
"hex",
|
||||
"histogram",
|
||||
"itertools",
|
||||
"native-tls",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres-native-tls",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
@@ -5148,7 +5242,7 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"safekeeper_api",
|
||||
"scopeguard",
|
||||
"sd-notify",
|
||||
@@ -5278,12 +5372,12 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
||||
|
||||
[[package]]
|
||||
name = "sentry"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
|
||||
checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
|
||||
dependencies = [
|
||||
"httpdate",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.21.11",
|
||||
"sentry-backtrace",
|
||||
"sentry-contexts",
|
||||
@@ -5297,9 +5391,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sentry-backtrace"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9"
|
||||
checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"once_cell",
|
||||
@@ -5309,11 +5403,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sentry-contexts"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a"
|
||||
checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
|
||||
dependencies = [
|
||||
"hostname",
|
||||
"hostname 0.4.0",
|
||||
"libc",
|
||||
"os_info",
|
||||
"rustc_version",
|
||||
@@ -5323,9 +5417,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sentry-core"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055"
|
||||
checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"rand 0.8.5",
|
||||
@@ -5336,9 +5430,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sentry-panic"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7"
|
||||
checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d"
|
||||
dependencies = [
|
||||
"sentry-backtrace",
|
||||
"sentry-core",
|
||||
@@ -5346,9 +5440,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sentry-tracing"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3"
|
||||
checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe"
|
||||
dependencies = [
|
||||
"sentry-backtrace",
|
||||
"sentry-core",
|
||||
@@ -5358,13 +5452,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "sentry-types"
|
||||
version = "0.31.6"
|
||||
version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd"
|
||||
checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c"
|
||||
dependencies = [
|
||||
"debugid",
|
||||
"getrandom 0.2.11",
|
||||
"hex",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
@@ -5756,10 +5850,12 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"routerify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -5778,7 +5874,7 @@ dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
@@ -5832,7 +5928,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
|
||||
[[package]]
|
||||
name = "svg_fmt"
|
||||
version = "0.4.2"
|
||||
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"
|
||||
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
@@ -5861,6 +5957,9 @@ name = "sync_wrapper"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
@@ -6479,12 +6578,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.20.0"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19"
|
||||
checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
@@ -6508,6 +6609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -6529,7 +6631,7 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry-semantic-conventions",
|
||||
"reqwest",
|
||||
"reqwest 0.12.4",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
@@ -6615,15 +6717,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
||||
dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.13"
|
||||
@@ -6905,9 +6998,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.86"
|
||||
version = "0.2.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
|
||||
checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"wasm-bindgen-macro",
|
||||
@@ -6915,9 +7008,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.86"
|
||||
version = "0.2.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
|
||||
checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"log",
|
||||
@@ -6930,9 +7023,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.36"
|
||||
version = "0.4.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
|
||||
checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
@@ -6942,9 +7035,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.86"
|
||||
version = "0.2.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
|
||||
checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
@@ -6952,9 +7045,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.86"
|
||||
version = "0.2.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
|
||||
checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -6965,9 +7058,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.86"
|
||||
version = "0.2.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
|
||||
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-streams"
|
||||
@@ -6982,6 +7075,19 @@ dependencies = [
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-streams"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-timer"
|
||||
version = "0.2.5"
|
||||
@@ -6999,9 +7105,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.63"
|
||||
version = "0.3.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
|
||||
checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
@@ -7022,6 +7128,15 @@ version = "0.25.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "4.4.0"
|
||||
@@ -7073,6 +7188,25 @@ dependencies = [
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
|
||||
dependencies = [
|
||||
"windows-core",
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.42.0"
|
||||
@@ -7305,6 +7439,16 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winreg"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
@@ -7354,7 +7498,8 @@ dependencies = [
|
||||
"regex",
|
||||
"regex-automata 0.4.3",
|
||||
"regex-syntax 0.8.2",
|
||||
"reqwest",
|
||||
"reqwest 0.11.19",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.21.11",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
@@ -7364,6 +7509,7 @@ dependencies = [
|
||||
"subtle",
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.52",
|
||||
"sync_wrapper",
|
||||
"time",
|
||||
"time-macros",
|
||||
"tokio",
|
||||
|
||||
23
Cargo.toml
23
Cargo.toml
@@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
azure_core = "0.18"
|
||||
azure_identity = "0.18"
|
||||
azure_storage = "0.18"
|
||||
azure_storage_blobs = "0.18"
|
||||
azure_core = "0.19"
|
||||
azure_identity = "0.19"
|
||||
azure_storage = "0.19"
|
||||
azure_storage_blobs = "0.19"
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -130,10 +130,10 @@ prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
|
||||
reqwest-middleware = "0.3.0"
|
||||
reqwest-retry = "0.5"
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
@@ -143,7 +143,7 @@ rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_path_to_error = "0.1"
|
||||
@@ -177,10 +177,11 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.20.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
|
||||
@@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
||||
&& mv s5cmd /usr/local/bin/s5cmd
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=17
|
||||
ENV LLVM_VERSION=18
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.77.0
|
||||
ENV RUSTC_VERSION=1.78.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
|
||||
@@ -47,7 +47,7 @@ use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
use tracing::{error, info, warn};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
@@ -62,6 +62,7 @@ use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::swap::resize_swap;
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
@@ -110,6 +111,7 @@ fn main() -> Result<()> {
|
||||
.expect("Postgres connection string is required");
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
@@ -226,14 +228,14 @@ fn main() -> Result<()> {
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps Postgres start quicker later,
|
||||
// because QEMU will already have it's memory allocated from the host, and
|
||||
// because QEMU will already have its memory allocated from the host, and
|
||||
// the necessary binaries will already be cached.
|
||||
if !spec_set {
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Launch http service first, so we were able to serve control-plane
|
||||
// requests, while configuration is still in progress.
|
||||
// Launch http service first, so that we can serve control-plane requests
|
||||
// while configuration is still in progress.
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
@@ -253,21 +255,22 @@ fn main() -> Result<()> {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Record for how long we slept waiting for the spec.
|
||||
let now = Utc::now();
|
||||
state.metrics.wait_for_spec_ms = now
|
||||
.signed_duration_since(state.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
// Reset start time, so that the total startup time that is calculated later will
|
||||
// not include the time that we waited for the spec.
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
// We got all we need, update the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
// Record for how long we slept waiting for the spec.
|
||||
state.metrics.wait_for_spec_ms = Utc::now()
|
||||
.signed_duration_since(state.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
// Reset start time to the actual start of the configuration, so that
|
||||
// total startup time was properly measured at the end.
|
||||
state.start_time = Utc::now();
|
||||
|
||||
state.status = ComputeStatus::Init;
|
||||
compute.state_changed.notify_all();
|
||||
|
||||
@@ -275,33 +278,72 @@ fn main() -> Result<()> {
|
||||
"running compute with features: {:?}",
|
||||
state.pspec.as_ref().unwrap().spec.features
|
||||
);
|
||||
// before we release the mutex, fetch the swap size (if any) for later.
|
||||
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(&compute);
|
||||
let _configurator_handle = launch_configurator(&compute);
|
||||
|
||||
// Start Postgres
|
||||
let mut prestartup_failed = false;
|
||||
let mut delay_exit = false;
|
||||
let mut exit_code = None;
|
||||
let pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
// Notify others that Postgres failed to start. In case of configuring the
|
||||
// empty compute, it's likely that API handler is still waiting for compute
|
||||
// state change. With this we will notify it that compute is in Failed state,
|
||||
// so control plane will know about it earlier and record proper error instead
|
||||
// of timeout.
|
||||
compute.state_changed.notify_all();
|
||||
drop(state); // unlock
|
||||
delay_exit = true;
|
||||
None
|
||||
|
||||
// Resize swap to the desired size if the compute spec says so
|
||||
if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
|
||||
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
|
||||
// *before* starting postgres.
|
||||
//
|
||||
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
|
||||
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
|
||||
// OOM-killed during startup because swap wasn't available yet.
|
||||
match resize_swap(size_bytes) {
|
||||
Ok(()) => {
|
||||
let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%size_bytes, %size_gib, "resized swap");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to resize swap");
|
||||
error!("{err:#}");
|
||||
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{err:?}"));
|
||||
state.status = ComputeStatus::Failed;
|
||||
compute.state_changed.notify_all();
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Start Postgres
|
||||
let mut pg = None;
|
||||
let mut exit_code = None;
|
||||
|
||||
if !prestartup_failed {
|
||||
pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
// Notify others that Postgres failed to start. In case of configuring the
|
||||
// empty compute, it's likely that API handler is still waiting for compute
|
||||
// state change. With this we will notify it that compute is in Failed state,
|
||||
// so control plane will know about it earlier and record proper error instead
|
||||
// of timeout.
|
||||
compute.state_changed.notify_all();
|
||||
drop(state); // unlock
|
||||
delay_exit = true;
|
||||
None
|
||||
}
|
||||
};
|
||||
} else {
|
||||
warn!("skipping postgres startup because pre-startup step failed");
|
||||
}
|
||||
|
||||
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
|
||||
// because it requires cgroups.
|
||||
@@ -526,6 +568,11 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("resize-swap-on-bind")
|
||||
.long("resize-swap-on-bind")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
|
||||
@@ -14,4 +14,5 @@ pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod swap;
|
||||
pub mod sync_sk;
|
||||
|
||||
@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
"rename_db" => {
|
||||
let new_name = op.new_name.as_ref().unwrap();
|
||||
|
||||
if existing_dbs.get(&op.name).is_some() {
|
||||
if existing_dbs.contains_key(&op.name) {
|
||||
let query: String = format!(
|
||||
"ALTER DATABASE {} RENAME TO {}",
|
||||
op.name.pg_quote(),
|
||||
|
||||
36
compute_tools/src/swap.rs
Normal file
36
compute_tools/src/swap.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use tracing::warn;
|
||||
|
||||
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
|
||||
|
||||
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
|
||||
// run `/neonvm/bin/resize-swap --once {size_bytes}`
|
||||
//
|
||||
// Passing '--once' causes resize-swap to delete itself after successful completion, which
|
||||
// means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
|
||||
// postgres is running.
|
||||
//
|
||||
// NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
|
||||
let child_result = std::process::Command::new("/usr/bin/sudo")
|
||||
.arg(RESIZE_SWAP_BIN)
|
||||
.arg("--once")
|
||||
.arg(size_bytes.to_string())
|
||||
.spawn();
|
||||
|
||||
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
|
||||
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => Err(anyhow!("process exited with {status}")),
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| {
|
||||
format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
|
||||
})
|
||||
}
|
||||
@@ -17,6 +17,7 @@ nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
hex.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
|
||||
@@ -14,15 +14,15 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::{
|
||||
@@ -837,6 +837,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.copied()
|
||||
.unwrap_or(false);
|
||||
|
||||
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||
|
||||
let mode = match (lsn, hot_standby) {
|
||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||
(None, true) => ComputeMode::Replica,
|
||||
@@ -854,7 +856,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
_ => {}
|
||||
}
|
||||
|
||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
||||
if !allow_multiple {
|
||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
||||
}
|
||||
|
||||
cplane.new_endpoint(
|
||||
&endpoint_id,
|
||||
@@ -883,6 +887,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
|
||||
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
@@ -908,11 +914,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
)?;
|
||||
if !allow_multiple {
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
)?;
|
||||
}
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
@@ -1444,6 +1452,12 @@ fn cli() -> Command {
|
||||
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
||||
.required(false);
|
||||
|
||||
let allow_multiple = Arg::new("allow-multiple")
|
||||
.help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
|
||||
.long("allow-multiple")
|
||||
.action(ArgAction::SetTrue)
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1554,8 +1568,8 @@ fn cli() -> Command {
|
||||
Command::new("storage_controller")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.subcommand(Command::new("start").about("Start storage controller"))
|
||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
@@ -1601,6 +1615,7 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(hot_standby_arg.clone())
|
||||
.arg(update_catalog)
|
||||
.arg(allow_multiple.clone())
|
||||
)
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
@@ -1609,6 +1624,7 @@ fn cli() -> Command {
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
.arg(allow_multiple.clone())
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
|
||||
@@ -554,6 +554,7 @@ impl Endpoint {
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
|
||||
@@ -17,6 +17,7 @@ use std::net::Ipv4Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -66,6 +67,10 @@ pub struct LocalEnv {
|
||||
|
||||
pub broker: NeonBroker,
|
||||
|
||||
// Configuration for the storage controller (1 per neon_local environment)
|
||||
#[serde(default)]
|
||||
pub storage_controller: NeonStorageControllerConf,
|
||||
|
||||
/// This Vec must always contain at least one pageserver
|
||||
pub pageservers: Vec<PageServerConf>,
|
||||
|
||||
@@ -98,6 +103,29 @@ pub struct NeonBroker {
|
||||
pub listen_addr: SocketAddr,
|
||||
}
|
||||
|
||||
/// Broker config for cluster internal communication.
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct NeonStorageControllerConf {
|
||||
/// Heartbeat timeout before marking a node offline
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_unavailable: Duration,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
|
||||
std::time::Duration::from_secs(10);
|
||||
}
|
||||
|
||||
impl Default for NeonStorageControllerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Dummy Default impl to satisfy Deserialize derive.
|
||||
impl Default for NeonBroker {
|
||||
fn default() -> Self {
|
||||
@@ -130,6 +158,7 @@ pub struct PageServerConf {
|
||||
pub(crate) virtual_file_io_engine: Option<String>,
|
||||
pub(crate) get_vectored_impl: Option<String>,
|
||||
pub(crate) get_impl: Option<String>,
|
||||
pub(crate) validate_vectored_get: Option<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -143,6 +172,7 @@ impl Default for PageServerConf {
|
||||
virtual_file_io_engine: None,
|
||||
get_vectored_impl: None,
|
||||
get_impl: None,
|
||||
validate_vectored_get: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -352,7 +382,10 @@ impl LocalEnv {
|
||||
|
||||
// Find neon binaries.
|
||||
if env.neon_distrib_dir == Path::new("") {
|
||||
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
env::current_exe()?
|
||||
.parent()
|
||||
.unwrap()
|
||||
.clone_into(&mut env.neon_distrib_dir);
|
||||
}
|
||||
|
||||
if env.pageservers.is_empty() {
|
||||
|
||||
@@ -93,6 +93,7 @@ impl PageServerNode {
|
||||
virtual_file_io_engine,
|
||||
get_vectored_impl,
|
||||
get_impl,
|
||||
validate_vectored_get,
|
||||
} = &self.conf;
|
||||
|
||||
let id = format!("id={}", id);
|
||||
@@ -117,6 +118,11 @@ impl PageServerNode {
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
|
||||
format!("validate_vectored_get={validate_vectored_get}")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
@@ -131,6 +137,7 @@ impl PageServerNode {
|
||||
virtual_file_io_engine,
|
||||
get_vectored_impl,
|
||||
get_impl,
|
||||
validate_vectored_get,
|
||||
];
|
||||
|
||||
if let Some(control_plane_api) = &self.env.control_plane_api {
|
||||
@@ -241,12 +248,13 @@ impl PageServerNode {
|
||||
// situation: the metadata is written by some other script.
|
||||
std::fs::write(
|
||||
metadata_path,
|
||||
serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": self.pg_connection_config.port(),
|
||||
"http_host": "localhost",
|
||||
"http_port": http_port,
|
||||
}))
|
||||
serde_json::to_vec(&pageserver_api::config::NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
other: HashMap::new(),
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
.expect("Failed to write metadata file");
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||
};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||
@@ -14,6 +16,7 @@ use pageserver_api::{
|
||||
};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
@@ -32,15 +35,13 @@ pub struct StorageController {
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
config: NeonStorageControllerConf,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
|
||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -135,6 +136,7 @@ impl StorageController {
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
config: env.storage_controller.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -272,8 +274,6 @@ impl StorageController {
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
@@ -283,7 +283,7 @@ impl StorageController {
|
||||
"--database-url",
|
||||
&database_url,
|
||||
"--max-unavailable-interval",
|
||||
&max_unavailable.to_string(),
|
||||
&humantime::Duration::from(self.config.max_unavailable).to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
@@ -379,7 +379,7 @@ impl StorageController {
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
method: reqwest::Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> anyhow::Result<RS>
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
@@ -14,7 +13,7 @@ use pageserver_api::{
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use reqwest::Url;
|
||||
use reqwest::{Method, StatusCode, Url};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
@@ -232,7 +231,7 @@ impl Client {
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
method: Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
|
||||
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
|
||||
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
|
||||
rebuilt on startup.
|
||||
|
||||
The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
||||
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
||||
|
||||
The `diesel` crate is used for defining models & migrations.
|
||||
|
||||
|
||||
477
flake.lock
generated
477
flake.lock
generated
@@ -1,477 +0,0 @@
|
||||
{
|
||||
"nodes": {
|
||||
"cachix": {
|
||||
"inputs": {
|
||||
"devenv": "devenv_2",
|
||||
"flake-compat": [
|
||||
"devenv",
|
||||
"flake-compat"
|
||||
],
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"nixpkgs"
|
||||
],
|
||||
"pre-commit-hooks": [
|
||||
"devenv",
|
||||
"pre-commit-hooks"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1712055811,
|
||||
"narHash": "sha256-7FcfMm5A/f02yyzuavJe06zLa9hcMHsagE28ADcmQvk=",
|
||||
"owner": "cachix",
|
||||
"repo": "cachix",
|
||||
"rev": "02e38da89851ec7fec3356a5c04bc8349cae0e30",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "cachix",
|
||||
"repo": "cachix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"devenv": {
|
||||
"inputs": {
|
||||
"cachix": "cachix",
|
||||
"flake-compat": "flake-compat_2",
|
||||
"nix": "nix_2",
|
||||
"nixpkgs": "nixpkgs_2",
|
||||
"pre-commit-hooks": "pre-commit-hooks"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1714390914,
|
||||
"narHash": "sha256-W5DFIifCjGYJXJzLU3RpqBeqes4zrf0Sr/6rwzTygPU=",
|
||||
"owner": "cachix",
|
||||
"repo": "devenv",
|
||||
"rev": "34e6461fd76b5f51ad5f8214f5cf22c4cd7a196e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "cachix",
|
||||
"repo": "devenv",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"devenv_2": {
|
||||
"inputs": {
|
||||
"flake-compat": [
|
||||
"devenv",
|
||||
"cachix",
|
||||
"flake-compat"
|
||||
],
|
||||
"nix": "nix",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"poetry2nix": "poetry2nix",
|
||||
"pre-commit-hooks": [
|
||||
"devenv",
|
||||
"cachix",
|
||||
"pre-commit-hooks"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1708704632,
|
||||
"narHash": "sha256-w+dOIW60FKMaHI1q5714CSibk99JfYxm0CzTinYWr+Q=",
|
||||
"owner": "cachix",
|
||||
"repo": "devenv",
|
||||
"rev": "2ee4450b0f4b95a1b90f2eb5ffea98b90e48c196",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "cachix",
|
||||
"ref": "python-rewrite",
|
||||
"repo": "devenv",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-compat": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1673956053,
|
||||
"narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-compat_2": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1696426674,
|
||||
"narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1712014858,
|
||||
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "flake-parts",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1689068808,
|
||||
"narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils_2": {
|
||||
"inputs": {
|
||||
"systems": "systems_2"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1710146030,
|
||||
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"gitignore": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"pre-commit-hooks",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1709087332,
|
||||
"narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "gitignore.nix",
|
||||
"rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "hercules-ci",
|
||||
"repo": "gitignore.nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nix": {
|
||||
"inputs": {
|
||||
"flake-compat": "flake-compat",
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"cachix",
|
||||
"devenv",
|
||||
"nixpkgs"
|
||||
],
|
||||
"nixpkgs-regression": "nixpkgs-regression"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1712911606,
|
||||
"narHash": "sha256-BGvBhepCufsjcUkXnEEXhEVjwdJAwPglCC2+bInc794=",
|
||||
"owner": "domenkozar",
|
||||
"repo": "nix",
|
||||
"rev": "b24a9318ea3f3600c1e24b4a00691ee912d4de12",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "domenkozar",
|
||||
"ref": "devenv-2.21",
|
||||
"repo": "nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nix-github-actions": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"cachix",
|
||||
"devenv",
|
||||
"poetry2nix",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1688870561,
|
||||
"narHash": "sha256-4UYkifnPEw1nAzqqPOTL2MvWtm3sNGw1UTYTalkTcGY=",
|
||||
"owner": "nix-community",
|
||||
"repo": "nix-github-actions",
|
||||
"rev": "165b1650b753316aa7f1787f3005a8d2da0f5301",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "nix-github-actions",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nix_2": {
|
||||
"inputs": {
|
||||
"flake-compat": [
|
||||
"devenv",
|
||||
"flake-compat"
|
||||
],
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"nixpkgs"
|
||||
],
|
||||
"nixpkgs-regression": "nixpkgs-regression_2"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1712911606,
|
||||
"narHash": "sha256-BGvBhepCufsjcUkXnEEXhEVjwdJAwPglCC2+bInc794=",
|
||||
"owner": "domenkozar",
|
||||
"repo": "nix",
|
||||
"rev": "b24a9318ea3f3600c1e24b4a00691ee912d4de12",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "domenkozar",
|
||||
"ref": "devenv-2.21",
|
||||
"repo": "nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1692808169,
|
||||
"narHash": "sha256-x9Opq06rIiwdwGeK2Ykj69dNc2IvUH1fY55Wm7atwrE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "9201b5ff357e781bf014d0330d18555695df7ba8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"dir": "lib",
|
||||
"lastModified": 1711703276,
|
||||
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"dir": "lib",
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-regression": {
|
||||
"locked": {
|
||||
"lastModified": 1643052045,
|
||||
"narHash": "sha256-uGJ0VXIhWKGXxkeNnq4TvV3CIOkUJ3PAoLZ3HMzNVMw=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-regression_2": {
|
||||
"locked": {
|
||||
"lastModified": 1643052045,
|
||||
"narHash": "sha256-uGJ0VXIhWKGXxkeNnq4TvV3CIOkUJ3PAoLZ3HMzNVMw=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-stable": {
|
||||
"locked": {
|
||||
"lastModified": 1710695816,
|
||||
"narHash": "sha256-3Eh7fhEID17pv9ZxrPwCLfqXnYP006RKzSs0JptsN84=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "614b4613980a522ba49f0d194531beddbb7220d3",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-23.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs_2": {
|
||||
"locked": {
|
||||
"lastModified": 1713361204,
|
||||
"narHash": "sha256-TA6EDunWTkc5FvDCqU3W2T3SFn0gRZqh6D/hJnM02MM=",
|
||||
"owner": "cachix",
|
||||
"repo": "devenv-nixpkgs",
|
||||
"rev": "285676e87ad9f0ca23d8714a6ab61e7e027020c6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "cachix",
|
||||
"ref": "rolling",
|
||||
"repo": "devenv-nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs_3": {
|
||||
"locked": {
|
||||
"lastModified": 1714314149,
|
||||
"narHash": "sha256-yNAevSKF4krRWacmLUsLK7D7PlfuY3zF0lYnGYNi9vQ=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "cf8cc1201be8bc71b7cbbbdaf349b22f4f99c7ae",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"poetry2nix": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nix-github-actions": "nix-github-actions",
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"cachix",
|
||||
"devenv",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1692876271,
|
||||
"narHash": "sha256-IXfZEkI0Mal5y1jr6IRWMqK8GW2/f28xJenZIPQqkY0=",
|
||||
"owner": "nix-community",
|
||||
"repo": "poetry2nix",
|
||||
"rev": "d5006be9c2c2417dafb2e2e5034d83fabd207ee3",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "poetry2nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"pre-commit-hooks": {
|
||||
"inputs": {
|
||||
"flake-compat": [
|
||||
"devenv",
|
||||
"flake-compat"
|
||||
],
|
||||
"flake-utils": "flake-utils_2",
|
||||
"gitignore": "gitignore",
|
||||
"nixpkgs": [
|
||||
"devenv",
|
||||
"nixpkgs"
|
||||
],
|
||||
"nixpkgs-stable": "nixpkgs-stable"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1713775815,
|
||||
"narHash": "sha256-Wu9cdYTnGQQwtT20QQMg7jzkANKQjwBD9iccfGKkfls=",
|
||||
"owner": "cachix",
|
||||
"repo": "pre-commit-hooks.nix",
|
||||
"rev": "2ac4dcbf55ed43f3be0bae15e181f08a57af24a4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "cachix",
|
||||
"repo": "pre-commit-hooks.nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"devenv": "devenv",
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs_3"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_2": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
58
flake.nix
58
flake.nix
@@ -1,58 +0,0 @@
|
||||
{
|
||||
inputs = {
|
||||
nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable";
|
||||
devenv.url = "github:cachix/devenv";
|
||||
};
|
||||
|
||||
outputs = inputs @ {
|
||||
flake-parts,
|
||||
nixpkgs,
|
||||
...
|
||||
}:
|
||||
flake-parts.lib.mkFlake {inherit inputs;} {
|
||||
imports = [
|
||||
inputs.devenv.flakeModule
|
||||
];
|
||||
systems = nixpkgs.lib.systems.flakeExposed;
|
||||
|
||||
perSystem = {
|
||||
config,
|
||||
self',
|
||||
inputs',
|
||||
pkgs,
|
||||
system,
|
||||
lib,
|
||||
...
|
||||
}: {
|
||||
devenv.shells.default = {
|
||||
packages = with pkgs; [
|
||||
rustup
|
||||
postgresql_16
|
||||
protobuf_26
|
||||
icu74
|
||||
pkg-config
|
||||
bison
|
||||
flex
|
||||
openssl
|
||||
libiconv
|
||||
readline
|
||||
zlib
|
||||
curl
|
||||
];
|
||||
|
||||
env.DISABLE_HOMEBREW = "1";
|
||||
|
||||
scripts = {
|
||||
neonmake = {
|
||||
description = "Build Neon";
|
||||
|
||||
exec =
|
||||
if (pkgs.stdenv.isDarwin)
|
||||
then "make -j`sysctl -n hw.logicalcpu` -s"
|
||||
else "make -j`nproc` -s";
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -33,6 +33,23 @@ pub struct ComputeSpec {
|
||||
#[serde(default)]
|
||||
pub features: Vec<ComputeFeature>,
|
||||
|
||||
/// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
|
||||
/// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
|
||||
/// received.
|
||||
///
|
||||
/// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
|
||||
/// spec generation doesn't need to be aware of the actual compute it's running on, while
|
||||
/// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
|
||||
/// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
|
||||
/// giving every VM much more swap than it should have (32GiB).
|
||||
///
|
||||
/// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
|
||||
/// enabling the swap resizing behavior once rollout is complete.
|
||||
///
|
||||
/// See neondatabase/cloud#12047 for more.
|
||||
#[serde(default)]
|
||||
pub swap_size_bytes: Option<u64>,
|
||||
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
|
||||
31
libs/pageserver_api/src/config.rs
Normal file
31
libs/pageserver_api/src/config.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use const_format::formatcp;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
|
||||
pub struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
pub postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub postgres_port: u16,
|
||||
pub http_host: String,
|
||||
pub http_port: u16,
|
||||
|
||||
// Deployment tools may write fields to the metadata file beyond what we
|
||||
// use in this type: this type intentionally only names fields that require.
|
||||
#[serde(flatten)]
|
||||
pub other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
22
libs/pageserver_api/src/config/tests.rs
Normal file
22
libs/pageserver_api/src/config/tests.rs
Normal file
@@ -0,0 +1,22 @@
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_node_metadata_v1_backward_compatibilty() {
|
||||
let v1 = serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": 23,
|
||||
"http_host": "localhost",
|
||||
"http_port": 42,
|
||||
}));
|
||||
|
||||
assert_eq!(
|
||||
serde_json::from_slice::<NodeMetadata>(&v1.unwrap()).unwrap(),
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
other: HashMap::new(),
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -80,7 +80,7 @@ impl Key {
|
||||
}
|
||||
|
||||
/// Get the range of metadata keys.
|
||||
pub fn metadata_key_range() -> Range<Self> {
|
||||
pub const fn metadata_key_range() -> Range<Self> {
|
||||
Key {
|
||||
field1: METADATA_KEY_BEGIN_PREFIX,
|
||||
field2: 0,
|
||||
@@ -572,14 +572,17 @@ pub const AUX_FILES_KEY: Key = Key {
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
/// Non inherited range for vectored get.
|
||||
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
|
||||
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&key)
|
||||
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::key::Key;
|
||||
use crate::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity},
|
||||
};
|
||||
use itertools::Itertools;
|
||||
|
||||
///
|
||||
@@ -14,6 +17,238 @@ pub struct KeySpace {
|
||||
pub ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
/// A wrapper type for sparse keyspaces.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub struct SparseKeySpace(pub KeySpace);
|
||||
|
||||
/// Represents a contiguous half-open range of the keyspace, masked according to a particular
|
||||
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
|
||||
/// shard.
|
||||
///
|
||||
/// When we iterate over keys within this object, we will skip any keys that don't belong
|
||||
/// to this shard.
|
||||
///
|
||||
/// The start + end keys may not belong to the shard: these specify where layer files should
|
||||
/// start + end, but we will never actually read/write those keys.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct ShardedRange<'a> {
|
||||
pub shard_identity: &'a ShardIdentity,
|
||||
pub range: Range<Key>,
|
||||
}
|
||||
|
||||
// Calculate the size of a range within the blocks of the same relation, or spanning only the
|
||||
// top page in the previous relation's space.
|
||||
fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
debug_assert!(is_contiguous_range(range));
|
||||
if range.start.field6 == 0xffffffff {
|
||||
range.end.field6 + 1
|
||||
} else {
|
||||
range.end.field6 - range.start.field6
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if this key range includes only keys in the same relation's data blocks, or
|
||||
/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
|
||||
///
|
||||
/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
|
||||
/// be on our shard. Later in ShardedRange we do the extra work to figure out how much
|
||||
/// of a given contiguous range is present on one shard.
|
||||
///
|
||||
/// This matters, because:
|
||||
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
|
||||
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
|
||||
fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||
range.start.field1 == range.end.field1
|
||||
&& range.start.field2 == range.end.field2
|
||||
&& range.start.field3 == range.end.field3
|
||||
&& range.start.field4 == range.end.field4
|
||||
&& (range.start.field5 == range.end.field5
|
||||
|| (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
|
||||
}
|
||||
|
||||
impl<'a> ShardedRange<'a> {
|
||||
pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
|
||||
Self {
|
||||
shard_identity,
|
||||
range,
|
||||
}
|
||||
}
|
||||
|
||||
/// Break up this range into chunks, each of which has at least one local key in it if the
|
||||
/// total range has at least one local key.
|
||||
pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
|
||||
// Optimization for single-key case (e.g. logical size keys)
|
||||
if self.range.end == self.range.start.add(1) {
|
||||
return vec![(
|
||||
if self.shard_identity.is_key_disposable(&self.range.start) {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
},
|
||||
self.range,
|
||||
)];
|
||||
}
|
||||
|
||||
if !is_contiguous_range(&self.range) {
|
||||
// Ranges that span relations are not fragmented. We only get these ranges as a result
|
||||
// of operations that act on existing layers, so we trust that the existing range is
|
||||
// reasonably small.
|
||||
return vec![(u32::MAX, self.range)];
|
||||
}
|
||||
|
||||
let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
|
||||
|
||||
let mut cursor = self.range.start;
|
||||
while cursor < self.range.end {
|
||||
let advance_by = self.distance_to_next_boundary(cursor);
|
||||
let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);
|
||||
|
||||
// If the previous fragment is undersized, then we seek to consume enough
|
||||
// blocks to complete it.
|
||||
let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
|
||||
Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
|
||||
Some(frag) => {
|
||||
// Prev block is complete, want the full number.
|
||||
(
|
||||
target_nblocks,
|
||||
if is_fragment_disposable {
|
||||
// If this current range will be empty (not shard-local data), we will merge into previous
|
||||
Some(frag)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
)
|
||||
}
|
||||
None => {
|
||||
// First iteration, want the full number
|
||||
(target_nblocks, None)
|
||||
}
|
||||
};
|
||||
|
||||
let advance_by = if is_fragment_disposable {
|
||||
advance_by
|
||||
} else {
|
||||
std::cmp::min(advance_by, want_blocks)
|
||||
};
|
||||
|
||||
let next_cursor = cursor.add(advance_by);
|
||||
|
||||
let this_frag = (
|
||||
if is_fragment_disposable {
|
||||
0
|
||||
} else {
|
||||
advance_by
|
||||
},
|
||||
cursor..next_cursor,
|
||||
);
|
||||
cursor = next_cursor;
|
||||
|
||||
if let Some(last_fragment) = merge_last_fragment {
|
||||
// Previous fragment was short or this one is empty, merge into it
|
||||
last_fragment.0 += this_frag.0;
|
||||
last_fragment.1.end = this_frag.1.end;
|
||||
} else {
|
||||
fragments.push(this_frag);
|
||||
}
|
||||
}
|
||||
|
||||
fragments
|
||||
}
|
||||
|
||||
/// Estimate the physical pages that are within this range, on this shard. This returns
|
||||
/// u32::MAX if the range spans relations: this return value should be interpreted as "large".
|
||||
pub fn page_count(&self) -> u32 {
|
||||
// Special cases for single keys like logical sizes
|
||||
if self.range.end == self.range.start.add(1) {
|
||||
return if self.shard_identity.is_key_disposable(&self.range.start) {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
};
|
||||
}
|
||||
|
||||
// We can only do an authentic calculation of contiguous key ranges
|
||||
if !is_contiguous_range(&self.range) {
|
||||
return u32::MAX;
|
||||
}
|
||||
|
||||
// Special case for single sharded tenants: our logical and physical sizes are the same
|
||||
if self.shard_identity.count < ShardCount::new(2) {
|
||||
return contiguous_range_len(&self.range);
|
||||
}
|
||||
|
||||
// Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
|
||||
// to Self, and add the stripe's block count to our total if so.
|
||||
let mut result: u64 = 0;
|
||||
let mut cursor = self.range.start;
|
||||
while cursor < self.range.end {
|
||||
// Count up to the next stripe_size boundary or end of range
|
||||
let advance_by = self.distance_to_next_boundary(cursor);
|
||||
|
||||
// If this blocks in this stripe belong to us, add them to our count
|
||||
if !self.shard_identity.is_key_disposable(&cursor) {
|
||||
result += advance_by as u64;
|
||||
}
|
||||
|
||||
cursor = cursor.add(advance_by);
|
||||
}
|
||||
|
||||
if result > u32::MAX as u64 {
|
||||
u32::MAX
|
||||
} else {
|
||||
result as u32
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance the cursor to the next potential fragment boundary: this is either
|
||||
/// a stripe boundary, or the end of the range.
|
||||
fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
|
||||
let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
|
||||
|
||||
if self.shard_identity.count < ShardCount::new(2) {
|
||||
// Optimization: don't bother stepping through stripes if the tenant isn't sharded.
|
||||
return distance_to_range_end;
|
||||
}
|
||||
|
||||
if cursor.field6 == 0xffffffff {
|
||||
// We are wrapping from one relation's logical size to the next relation's first data block
|
||||
return 1;
|
||||
}
|
||||
|
||||
let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
|
||||
let stripe_remainder = self.shard_identity.stripe_size.0
|
||||
- (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// We should never overflow field5 and field6 -- our callers check this earlier
|
||||
// and would have returned their u32::MAX cases if the input range violated this.
|
||||
let next_cursor = cursor.add(stripe_remainder);
|
||||
debug_assert!(
|
||||
next_cursor.field1 == cursor.field1
|
||||
&& next_cursor.field2 == cursor.field2
|
||||
&& next_cursor.field3 == cursor.field3
|
||||
&& next_cursor.field4 == cursor.field4
|
||||
&& next_cursor.field5 == cursor.field5
|
||||
)
|
||||
}
|
||||
|
||||
std::cmp::min(stripe_remainder, distance_to_range_end)
|
||||
}
|
||||
|
||||
/// Whereas `page_count` estimates the number of pages physically in this range on this shard,
|
||||
/// this function simply calculates the number of pages in the space, without accounting for those
|
||||
/// pages that would not actually be stored on this node.
|
||||
///
|
||||
/// Don't use this function in code that works with physical entities like layer files.
|
||||
fn raw_size(range: &Range<Key>) -> u32 {
|
||||
if is_contiguous_range(range) {
|
||||
contiguous_range_len(range)
|
||||
} else {
|
||||
u32::MAX
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeySpace {
|
||||
/// Create a key space with a single range.
|
||||
pub fn single(key_range: Range<Key>) -> Self {
|
||||
@@ -25,39 +260,36 @@ impl KeySpace {
|
||||
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
|
||||
/// in each partition.
|
||||
///
|
||||
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
|
||||
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
|
||||
// Assume that each value is 8k in size.
|
||||
let target_nblocks = (target_size / BLCKSZ as u64) as usize;
|
||||
let target_nblocks = (target_size / BLCKSZ as u64) as u32;
|
||||
|
||||
let mut parts = Vec::new();
|
||||
let mut current_part = Vec::new();
|
||||
let mut current_part_size: usize = 0;
|
||||
for range in &self.ranges {
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, start a new partition.
|
||||
let this_size = key_range_size(range) as usize;
|
||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
// While doing partitioning, wrap the range in ShardedRange so that our size calculations
|
||||
// will respect shard striping rather than assuming all keys within a range are present.
|
||||
let range = ShardedRange::new(range.clone(), shard_identity);
|
||||
|
||||
// If the next range is larger than 'target_size', split it into
|
||||
// 'target_size' chunks.
|
||||
let mut remain_size = this_size;
|
||||
let mut start = range.start;
|
||||
while remain_size > target_nblocks {
|
||||
let next = start.add(target_nblocks as u32);
|
||||
parts.push(KeySpace {
|
||||
ranges: vec![start..next],
|
||||
});
|
||||
start = next;
|
||||
remain_size -= target_nblocks
|
||||
// Chunk up the range into parts that each contain up to target_size local blocks
|
||||
for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, and our current partition
|
||||
// covers at least one block that is physically present in this shard,
|
||||
// then start a new partition
|
||||
if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
|
||||
&& current_part_size > 0
|
||||
{
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
current_part.push(frag_range.start..frag_range.end);
|
||||
current_part_size += frag_on_shard_size as usize;
|
||||
}
|
||||
current_part.push(start..range.end);
|
||||
current_part_size += remain_size;
|
||||
}
|
||||
|
||||
// add last partition that wasn't full yet.
|
||||
@@ -71,7 +303,7 @@ impl KeySpace {
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.total_size() == 0
|
||||
self.total_raw_size() == 0
|
||||
}
|
||||
|
||||
/// Merge another keyspace into the current one.
|
||||
@@ -164,11 +396,11 @@ impl KeySpace {
|
||||
self.ranges.last().map(|range| range.end)
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub fn total_size(&self) -> usize {
|
||||
/// The size of the keyspace in pages, before accounting for sharding
|
||||
pub fn total_raw_size(&self) -> usize {
|
||||
self.ranges
|
||||
.iter()
|
||||
.map(|range| key_range_size(range) as usize)
|
||||
.map(|range| ShardedRange::raw_size(range) as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
@@ -207,10 +439,33 @@ pub struct KeyPartitioning {
|
||||
pub parts: Vec<KeySpace>,
|
||||
}
|
||||
|
||||
/// Represents a partitioning of the sparse key space.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct SparseKeyPartitioning {
|
||||
pub parts: Vec<SparseKeySpace>,
|
||||
}
|
||||
|
||||
impl KeyPartitioning {
|
||||
pub fn new() -> Self {
|
||||
KeyPartitioning { parts: Vec::new() }
|
||||
}
|
||||
|
||||
/// Convert a key partitioning to a sparse partition.
|
||||
pub fn into_sparse(self) -> SparseKeyPartitioning {
|
||||
SparseKeyPartitioning {
|
||||
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SparseKeyPartitioning {
|
||||
/// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
|
||||
/// cause long/dead loops.
|
||||
pub fn into_dense(self) -> KeyPartitioning {
|
||||
KeyPartitioning {
|
||||
parts: self.parts.into_iter().map(|x| x.0).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -242,7 +497,7 @@ impl KeySpaceAccum {
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
self.size += key_range_size(&range) as u64;
|
||||
self.size += ShardedRange::raw_size(&range) as u64;
|
||||
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
@@ -274,7 +529,9 @@ impl KeySpaceAccum {
|
||||
std::mem::take(self).to_keyspace()
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
// The total number of keys in this object, ignoring any sharding effects that might cause some of
|
||||
// the keys to be omitted in storage on this shard.
|
||||
pub fn raw_size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
@@ -330,36 +587,19 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
let start = key_range.start;
|
||||
let end = key_range.end;
|
||||
|
||||
if end.field1 != start.field1
|
||||
|| end.field2 != start.field2
|
||||
|| end.field3 != start.field3
|
||||
|| end.field4 != start.field4
|
||||
{
|
||||
return u32::MAX;
|
||||
}
|
||||
|
||||
let start = (start.field5 as u64) << 32 | start.field6 as u64;
|
||||
let end = (end.field5 as u64) << 32 | end.field6 as u64;
|
||||
|
||||
let diff = end - start;
|
||||
if diff > u32::MAX as u64 {
|
||||
u32::MAX
|
||||
} else {
|
||||
diff as u32
|
||||
}
|
||||
}
|
||||
|
||||
pub fn singleton_range(key: Key) -> Range<Key> {
|
||||
key..key.next()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use crate::{
|
||||
models::ShardParameters,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use std::fmt::Write;
|
||||
|
||||
@@ -402,14 +642,17 @@ mod tests {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
|
||||
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
|
||||
assert_eq!(accum.size(), expected_size);
|
||||
let expected_size: u64 = ranges
|
||||
.iter()
|
||||
.map(|r| ShardedRange::raw_size(r) as u64)
|
||||
.sum();
|
||||
assert_eq!(accum.raw_size(), expected_size);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
||||
assert_eq!(accum.size(), 0);
|
||||
assert_eq!(accum.raw_size(), 0);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
||||
assert_eq!(accum.size(), 0);
|
||||
assert_eq!(accum.raw_size(), 0);
|
||||
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
@@ -706,4 +949,412 @@ mod tests {
|
||||
]
|
||||
);
|
||||
}
|
||||
#[test]
|
||||
fn sharded_range_relation_gap() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
|
||||
end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
|
||||
// Key range spans relations, expect MAX
|
||||
assert_eq!(range.page_count(), u32::MAX);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_single_key() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
|
||||
end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
// Single-key range on logical size key
|
||||
assert_eq!(range.page_count(), 1);
|
||||
}
|
||||
|
||||
/// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
|
||||
#[test]
|
||||
fn contiguous_range_check() {
|
||||
assert!(!is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
|
||||
),);
|
||||
|
||||
// The ranges goes all the way up to the 0xffffffff, including it: this is
|
||||
// not considered a rel block range because 0xffffffff stores logical sizes,
|
||||
// not blocks.
|
||||
assert!(!is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
|
||||
),);
|
||||
|
||||
// Keys within the normal data region of a relation
|
||||
assert!(is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
|
||||
),);
|
||||
|
||||
// The logical size key of one forkno, then some blocks in the next
|
||||
assert!(is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
|
||||
),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_forkno_gap() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
|
||||
end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
|
||||
// Range spanning the end of one forkno and the start of the next: we do not attempt to
|
||||
// calculate a valid size, because we have no way to know if they keys between start
|
||||
// and end are actually in use.
|
||||
assert_eq!(range.page_count(), u32::MAX);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_one_relation() {
|
||||
for shard_number in 0..4 {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
|
||||
end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
|
||||
// Very simple case: range covering block zero of one relation, where that block maps to shard zero
|
||||
if shard_number == 0 {
|
||||
assert_eq!(range.page_count(), 1);
|
||||
} else {
|
||||
// Other shards should perceive the range's size as zero
|
||||
assert_eq!(range.page_count(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test helper: construct a ShardedRange and call fragment() on it, returning
|
||||
/// the total page count in the range and the fragments.
|
||||
fn do_fragment(
|
||||
range_start: Key,
|
||||
range_end: Key,
|
||||
shard_identity: &ShardIdentity,
|
||||
target_nblocks: u32,
|
||||
) -> (u32, Vec<(u32, Range<Key>)>) {
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: range_start,
|
||||
end: range_end,
|
||||
},
|
||||
shard_identity,
|
||||
);
|
||||
|
||||
let page_count = range.page_count();
|
||||
let fragments = range.fragment(target_nblocks);
|
||||
|
||||
// Invariant: we always get at least one fragment
|
||||
assert!(!fragments.is_empty());
|
||||
|
||||
// Invariant: the first/last fragment start/end should equal the input start/end
|
||||
assert_eq!(fragments.first().unwrap().1.start, range_start);
|
||||
assert_eq!(fragments.last().unwrap().1.end, range_end);
|
||||
|
||||
if page_count > 0 {
|
||||
// Invariant: every fragment must contain at least one shard-local page, if the
|
||||
// total range contains at least one shard-local page
|
||||
let all_nonzero = fragments.iter().all(|f| f.0 > 0);
|
||||
if !all_nonzero {
|
||||
eprintln!("Found a zero-length fragment: {:?}", fragments);
|
||||
}
|
||||
assert!(all_nonzero);
|
||||
} else {
|
||||
// A range with no shard-local pages should always be returned as a single fragment
|
||||
assert_eq!(fragments, vec![(0, range_start..range_end)]);
|
||||
}
|
||||
|
||||
// Invariant: fragments must be ordered and non-overlapping
|
||||
let mut last: Option<Range<Key>> = None;
|
||||
for frag in &fragments {
|
||||
if let Some(last) = last {
|
||||
assert!(frag.1.start >= last.end);
|
||||
assert!(frag.1.start > last.start);
|
||||
}
|
||||
last = Some(frag.1.clone())
|
||||
}
|
||||
|
||||
// Invariant: fragments respect target_nblocks
|
||||
for frag in &fragments {
|
||||
assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
|
||||
}
|
||||
|
||||
(page_count, fragments)
|
||||
}
|
||||
|
||||
/// Really simple tests for fragment(), on a range that just contains a single stripe
|
||||
/// for a single tenant.
|
||||
#[test]
|
||||
fn sharded_range_fragment_simple() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// A range which we happen to know covers exactly one stripe which belongs to this shard
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
|
||||
|
||||
// Ask for stripe_size blocks, we get the whole stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 32768),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for more, we still get the whole stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 10000000),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for target_nblocks of half the stripe size, we get two halves
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 16384),
|
||||
(
|
||||
32768,
|
||||
vec![
|
||||
(16384, input_start..input_start.add(16384)),
|
||||
(16384, input_start.add(16384)..input_end)
|
||||
]
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sharded_range_fragment_multi_stripe() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
// Ask for all the blocks, get a fragment that covers the whole range but reports
|
||||
// its size to be just the blocks belonging to our shard.
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 131072),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for a sub-stripe quantity
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 16000),
|
||||
(
|
||||
32768,
|
||||
vec![
|
||||
(16000, input_start..input_start.add(16000)),
|
||||
(16000, input_start.add(16000)..input_start.add(32000)),
|
||||
(768, input_start.add(32000)..input_end),
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
// Try on a range that starts slightly after our owned stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
|
||||
(32767, vec![(32767, input_start.add(1)..input_end)])
|
||||
);
|
||||
}
|
||||
|
||||
/// Test our calculations work correctly when we start a range from the logical size key of
|
||||
/// a previous relation.
|
||||
#[test]
|
||||
fn sharded_range_fragment_starting_from_logical_size() {
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
|
||||
|
||||
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||
(0x8001, vec![(0x8001, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
|
||||
// store all logical sizes)
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||
(0x1, vec![(0x1, input_start..input_end)])
|
||||
);
|
||||
}
|
||||
|
||||
/// Test that ShardedRange behaves properly when used on un-sharded data
|
||||
#[test]
|
||||
fn sharded_range_fragment_unsharded() {
|
||||
let shard_identity = ShardIdentity::unsharded();
|
||||
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||
(
|
||||
0x10000,
|
||||
vec![
|
||||
(0x8000, input_start..input_start.add(0x8000)),
|
||||
(0x8000, input_start.add(0x8000)..input_start.add(0x10000))
|
||||
]
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sharded_range_fragment_cross_relation() {
|
||||
let shard_identity = ShardIdentity::unsharded();
|
||||
|
||||
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
|
||||
);
|
||||
|
||||
// Same, but using a sharded identity
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sharded_range_fragment_tiny_nblocks() {
|
||||
let shard_identity = ShardIdentity::unsharded();
|
||||
|
||||
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
|
||||
let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 16),
|
||||
(
|
||||
0x38,
|
||||
vec![
|
||||
(16, input_start..input_start.add(16)),
|
||||
(16, input_start.add(16)..input_start.add(32)),
|
||||
(16, input_start.add(32)..input_start.add(48)),
|
||||
(8, input_start.add(48)..input_end),
|
||||
]
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sharded_range_fragment_fuzz() {
|
||||
// Use a fixed seed: we don't want to explicitly pick values, but we do want
|
||||
// the test to be reproducible.
|
||||
let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
|
||||
|
||||
for _i in 0..1000 {
|
||||
let shard_identity = if prng.next_u32() % 2 == 0 {
|
||||
ShardIdentity::unsharded()
|
||||
} else {
|
||||
let shard_count = prng.next_u32() % 127 + 1;
|
||||
ShardIdentity::new(
|
||||
ShardNumber((prng.next_u32() % shard_count) as u8),
|
||||
ShardCount::new(shard_count as u8),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let target_nblocks = prng.next_u32() % 65536 + 1;
|
||||
|
||||
let start_offset = prng.next_u32() % 16384;
|
||||
|
||||
// Try ranges up to 4GiB in size, that are always at least 1
|
||||
let range_size = prng.next_u32() % 8192 + 1;
|
||||
|
||||
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
|
||||
let input_start = Key::from_hex("000000067F00000001000004E10000000000")
|
||||
.unwrap()
|
||||
.add(start_offset);
|
||||
let input_end = input_start.add(range_size);
|
||||
|
||||
// This test's main success conditions are the invariants baked into do_fragment
|
||||
let (_total_size, fragments) =
|
||||
do_fragment(input_start, input_end, &shard_identity, target_nblocks);
|
||||
|
||||
// Pick a random key within the range and check it appears in the output
|
||||
let example_key = input_start.add(prng.next_u32() % range_size);
|
||||
|
||||
// Panic on unwrap if it isn't found
|
||||
let example_key_frag = fragments
|
||||
.iter()
|
||||
.find(|f| f.1.contains(&example_key))
|
||||
.unwrap();
|
||||
|
||||
// Check that the fragment containing our random key has a nonzero size if
|
||||
// that key is shard-local
|
||||
let example_key_local = !shard_identity.is_key_disposable(&example_key);
|
||||
if example_key_local {
|
||||
assert!(example_key_frag.0 > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use const_format::formatcp;
|
||||
|
||||
pub mod controller_api;
|
||||
pub mod key;
|
||||
@@ -11,7 +10,4 @@ pub mod shard;
|
||||
/// Public API types
|
||||
pub mod upcall_api;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
pub mod config;
|
||||
|
||||
@@ -430,8 +430,6 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub tenant_id: Option<TenantShardId>,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::keyspace::SparseKeySpace;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct Partitioning {
|
||||
pub keys: crate::keyspace::KeySpace,
|
||||
|
||||
pub sparse_keys: crate::keyspace::SparseKeySpace,
|
||||
pub at_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning {
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("sparse_keys")?;
|
||||
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
@@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
|
||||
#[derive(serde::Deserialize)]
|
||||
struct De {
|
||||
keys: KeySpace,
|
||||
sparse_keys: KeySpace,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
@@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
|
||||
Ok(Self {
|
||||
at_lsn: de.at_lsn,
|
||||
keys: de.keys.0,
|
||||
sparse_keys: SparseKeySpace(de.sparse_keys.0),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -133,6 +139,12 @@ mod tests {
|
||||
"030000000000000000000000000000000003"
|
||||
]
|
||||
],
|
||||
"sparse_keys": [
|
||||
[
|
||||
"620000000000000000000000000000000000",
|
||||
"620000000000000000000000000000000003"
|
||||
]
|
||||
],
|
||||
"at_lsn": "0/2240160"
|
||||
}
|
||||
"#;
|
||||
|
||||
@@ -97,7 +97,7 @@ impl ShardCount {
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as `TenantShardId::unsharded`.
|
||||
/// as [`TenantShardId::unsharded`].
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
@@ -116,7 +116,9 @@ impl ShardCount {
|
||||
self.0
|
||||
}
|
||||
|
||||
///
|
||||
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
|
||||
/// uses the legacy format for `TenantShardId`. See also the documentation for
|
||||
/// [`Self::count`].
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
@@ -451,7 +453,7 @@ impl ShardIdentity {
|
||||
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
|
||||
/// tenants. Modern single-shard tenants should not use this: they should
|
||||
/// have number=0 count=1.
|
||||
pub fn unsharded() -> Self {
|
||||
pub const fn unsharded() -> Self {
|
||||
Self {
|
||||
number: ShardNumber(0),
|
||||
count: ShardCount(0),
|
||||
@@ -538,24 +540,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
|
||||
///
|
||||
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
|
||||
/// as a symptom of that issue.
|
||||
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
|
||||
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut hash = murmurhash32(key.field4);
|
||||
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
|
||||
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
|
||||
|
||||
// The key may be affected by issue #7454: it is an initfork and it would not
|
||||
// have mapped to shard 0 until we fixed that issue.
|
||||
mapped_shard != ShardNumber(0)
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split.
|
||||
///
|
||||
|
||||
@@ -5,7 +5,6 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
futures.workspace = true
|
||||
|
||||
@@ -78,17 +78,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||
)
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Handler<IO> {
|
||||
/// Handle single query.
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care). It will also flush out the output buffer.
|
||||
async fn process_query(
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError>;
|
||||
) -> impl Future<Output = Result<(), QueryError>> + Send;
|
||||
|
||||
/// Called on startup packet receival, allows to process params.
|
||||
///
|
||||
|
||||
@@ -22,7 +22,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
|
||||
struct TestHandler {}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
|
||||
// return single col 'hey' for any query
|
||||
async fn process_query(
|
||||
|
||||
@@ -331,7 +331,10 @@ impl CheckPoint {
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
let mut new_xid = std::cmp::max(
|
||||
xid.wrapping_add(1),
|
||||
pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||
);
|
||||
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
||||
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||
new_xid =
|
||||
@@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
|
||||
let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
let first_page_only = seg_off < XLOG_BLCKSZ;
|
||||
let (shdr_rem_len, infoflags) = if first_page_only {
|
||||
(seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
|
||||
// If first records starts in the middle of the page, pretend in page header
|
||||
// there is a fake record which ends where first real record starts. This
|
||||
// makes pg_waldump etc happy.
|
||||
let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
|
||||
assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
|
||||
// xlp_rem_len doesn't include page header, hence the subtraction.
|
||||
(
|
||||
seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
|
||||
pg_constants::XLP_FIRST_IS_CONTRECORD,
|
||||
)
|
||||
} else {
|
||||
(0, 0)
|
||||
};
|
||||
@@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
|
||||
|
||||
if !first_page_only {
|
||||
let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
|
||||
// see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
|
||||
let (xlp_rem_len, xlp_info) = if page_off > 0 {
|
||||
assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
|
||||
(
|
||||
(page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
|
||||
pg_constants::XLP_FIRST_IS_CONTRECORD,
|
||||
)
|
||||
} else {
|
||||
(0, 0)
|
||||
};
|
||||
let header = XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
|
||||
pg_constants::XLP_FIRST_IS_CONTRECORD
|
||||
} else {
|
||||
0
|
||||
},
|
||||
xlp_info,
|
||||
xlp_tli: PG_TLI,
|
||||
xlp_pageaddr: lsn.page_lsn().0,
|
||||
xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
|
||||
page_off as u32
|
||||
} else {
|
||||
0u32
|
||||
},
|
||||
xlp_rem_len,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
};
|
||||
let hdr_bytes = header.encode()?;
|
||||
|
||||
@@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true
|
||||
futures-util.workspace = true
|
||||
http-types.workspace = true
|
||||
itertools.workspace = true
|
||||
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::io;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
@@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use bytes::Bytes;
|
||||
use futures::future::Either;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use futures_util::TryStreamExt;
|
||||
@@ -128,12 +130,12 @@ impl AzureBlobStorage {
|
||||
let kind = RequestKind::Get;
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
|
||||
let download = async {
|
||||
let response = builder
|
||||
@@ -152,39 +154,46 @@ impl AzureBlobStorage {
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
|
||||
let mut response = std::pin::pin!(response);
|
||||
let mut response = Box::pin(response);
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part?;
|
||||
if etag.is_none() {
|
||||
etag = Some(part.blob.properties.etag);
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
}
|
||||
|
||||
if bufs.is_empty() {
|
||||
let Some(part) = response.next().await else {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Azure GET response contained no buffers"
|
||||
"Azure GET response contained no response body"
|
||||
)));
|
||||
};
|
||||
let part = part?;
|
||||
if etag.is_none() {
|
||||
etag = Some(part.blob.properties.etag);
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
|
||||
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
||||
let etag = etag.unwrap();
|
||||
let last_modified = last_modified.unwrap();
|
||||
|
||||
let tail_stream = response
|
||||
.map(|part| match part {
|
||||
Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
|
||||
Err(e) => {
|
||||
Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
|
||||
}
|
||||
})
|
||||
.flatten();
|
||||
let stream = part
|
||||
.data
|
||||
.map(|r| r.map_err(io::Error::other))
|
||||
.chain(sync_wrapper::SyncStream::new(tail_stream));
|
||||
//.chain(SyncStream::from_pin(Box::pin(tail_stream)));
|
||||
|
||||
let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
|
||||
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
download_stream: Box::pin(download_stream),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
@@ -193,7 +202,10 @@ impl AzureBlobStorage {
|
||||
|
||||
tokio::select! {
|
||||
bufs = download => bufs,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
|
||||
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
|
||||
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// We set this a little bit low as we currently buffer the entire file into RAM
|
||||
/// Set this limit analogously to the S3 limit
|
||||
///
|
||||
/// Here, a limit of max 20k concurrent connections was noted.
|
||||
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
|
||||
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
|
||||
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
@@ -2,11 +2,10 @@
|
||||
|
||||
use std::cmp::{Eq, Ordering};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch::{channel, Receiver, Sender};
|
||||
use tokio::sync::watch::{self, channel};
|
||||
use tokio::time::timeout;
|
||||
|
||||
/// An error happened while waiting for a number
|
||||
@@ -35,23 +34,73 @@ pub trait MonotonicCounter<V> {
|
||||
fn cnt_value(&self) -> V;
|
||||
}
|
||||
|
||||
/// Internal components of a `SeqWait`
|
||||
struct SeqWaitInt<S, V>
|
||||
/// Heap of waiters, lowest numbers pop first.
|
||||
struct Waiters<V>
|
||||
where
|
||||
S: MonotonicCounter<V>,
|
||||
V: Ord,
|
||||
{
|
||||
waiters: BinaryHeap<Waiter<V>>,
|
||||
current: S,
|
||||
shutdown: bool,
|
||||
heap: BinaryHeap<Waiter<V>>,
|
||||
/// Number of the first waiter in the heap, or None if there are no waiters.
|
||||
status_channel: watch::Sender<Option<V>>,
|
||||
}
|
||||
|
||||
impl<V> Waiters<V>
|
||||
where
|
||||
V: Ord + Copy,
|
||||
{
|
||||
fn new() -> Self {
|
||||
Waiters {
|
||||
heap: BinaryHeap::new(),
|
||||
status_channel: channel(None).0,
|
||||
}
|
||||
}
|
||||
|
||||
/// `status_channel` contains the number of the first waiter in the heap.
|
||||
/// This function should be called whenever waiters heap changes.
|
||||
fn update_status(&self) {
|
||||
let first_waiter = self.heap.peek().map(|w| w.wake_num);
|
||||
let _ = self.status_channel.send_replace(first_waiter);
|
||||
}
|
||||
|
||||
/// Add new waiter to the heap, return a channel that will be notified when the number arrives.
|
||||
fn add(&mut self, num: V) -> watch::Receiver<()> {
|
||||
let (tx, rx) = channel(());
|
||||
self.heap.push(Waiter {
|
||||
wake_num: num,
|
||||
wake_channel: tx,
|
||||
});
|
||||
self.update_status();
|
||||
rx
|
||||
}
|
||||
|
||||
/// Pop all waiters <= num from the heap. Collect channels in a vector,
|
||||
/// so that caller can wake them up.
|
||||
fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
|
||||
let mut wake_these = Vec::new();
|
||||
while let Some(n) = self.heap.peek() {
|
||||
if n.wake_num > num {
|
||||
break;
|
||||
}
|
||||
wake_these.push(self.heap.pop().unwrap().wake_channel);
|
||||
}
|
||||
self.update_status();
|
||||
wake_these
|
||||
}
|
||||
|
||||
/// Used on shutdown to efficiently drop all waiters.
|
||||
fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
|
||||
let heap = mem::take(&mut self.heap);
|
||||
self.update_status();
|
||||
heap
|
||||
}
|
||||
}
|
||||
|
||||
struct Waiter<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
wake_num: T, // wake me when this number arrives ...
|
||||
wake_channel: Sender<()>, // ... by sending a message to this channel
|
||||
wake_num: T, // wake me when this number arrives ...
|
||||
wake_channel: watch::Sender<()>, // ... by sending a message to this channel
|
||||
}
|
||||
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
@@ -76,6 +125,17 @@ impl<T: Ord> PartialEq for Waiter<T> {
|
||||
|
||||
impl<T: Ord> Eq for Waiter<T> {}
|
||||
|
||||
/// Internal components of a `SeqWait`
|
||||
struct SeqWaitInt<S, V>
|
||||
where
|
||||
S: MonotonicCounter<V>,
|
||||
V: Ord,
|
||||
{
|
||||
waiters: Waiters<V>,
|
||||
current: S,
|
||||
shutdown: bool,
|
||||
}
|
||||
|
||||
/// A tool for waiting on a sequence number
|
||||
///
|
||||
/// This provides a way to wait the arrival of a number.
|
||||
@@ -108,7 +168,7 @@ where
|
||||
/// Create a new `SeqWait`, initialized to a particular number
|
||||
pub fn new(starting_num: S) -> Self {
|
||||
let internal = SeqWaitInt {
|
||||
waiters: BinaryHeap::new(),
|
||||
waiters: Waiters::new(),
|
||||
current: starting_num,
|
||||
shutdown: false,
|
||||
};
|
||||
@@ -128,9 +188,8 @@ where
|
||||
// Block any future waiters from starting
|
||||
internal.shutdown = true;
|
||||
|
||||
// This will steal the entire waiters map.
|
||||
// When we drop it all waiters will be woken.
|
||||
mem::take(&mut internal.waiters)
|
||||
// Take all waiters to drop them later.
|
||||
internal.waiters.take_all()
|
||||
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
@@ -196,7 +255,7 @@ where
|
||||
|
||||
/// Register and return a channel that will be notified when a number arrives,
|
||||
/// or None, if it has already arrived.
|
||||
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
||||
fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
if internal.current.cnt_value() >= num {
|
||||
return Ok(None);
|
||||
@@ -205,12 +264,8 @@ where
|
||||
return Err(SeqWaitError::Shutdown);
|
||||
}
|
||||
|
||||
// Create a new channel.
|
||||
let (tx, rx) = channel(());
|
||||
internal.waiters.push(Waiter {
|
||||
wake_num: num,
|
||||
wake_channel: tx,
|
||||
});
|
||||
// Add waiter channel to the queue.
|
||||
let rx = internal.waiters.add(num);
|
||||
// Drop the lock as we exit this scope.
|
||||
Ok(Some(rx))
|
||||
}
|
||||
@@ -231,16 +286,8 @@ where
|
||||
}
|
||||
internal.current.cnt_advance(num);
|
||||
|
||||
// Pop all waiters <= num from the heap. Collect them in a vector, and
|
||||
// wake them up after releasing the lock.
|
||||
let mut wake_these = Vec::new();
|
||||
while let Some(n) = internal.waiters.peek() {
|
||||
if n.wake_num > num {
|
||||
break;
|
||||
}
|
||||
wake_these.push(internal.waiters.pop().unwrap().wake_channel);
|
||||
}
|
||||
wake_these
|
||||
// Pop all waiters <= num from the heap.
|
||||
internal.waiters.pop_leq(num)
|
||||
};
|
||||
|
||||
for tx in wake_these {
|
||||
@@ -255,6 +302,23 @@ where
|
||||
pub fn load(&self) -> S {
|
||||
self.internal.lock().unwrap().current
|
||||
}
|
||||
|
||||
/// Get a Receiver for the current status.
|
||||
///
|
||||
/// The current status is the number of the first waiter in the queue,
|
||||
/// or None if there are no waiters.
|
||||
///
|
||||
/// This receiver will be notified whenever the status changes.
|
||||
/// It is useful for receiving notifications when the first waiter
|
||||
/// starts waiting for a number, or when there are no more waiters left.
|
||||
pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
|
||||
self.internal
|
||||
.lock()
|
||||
.unwrap()
|
||||
.waiters
|
||||
.status_channel
|
||||
.subscribe()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -15,7 +15,6 @@ anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
|
||||
@@ -284,6 +284,34 @@ impl Client {
|
||||
Ok((status, progress))
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_status(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<SecondaryProgress> {
|
||||
let path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{}/secondary/status",
|
||||
self.mgmt_api_endpoint, tenant_shard_id
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
self.request(Method::GET, path, ())
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||
let path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{}/heatmap_upload",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
self.request(Method::POST, path, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -291,10 +319,7 @@ impl Client {
|
||||
flush_ms: Option<std::time::Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest {
|
||||
tenant_id: None,
|
||||
config,
|
||||
};
|
||||
let req_body = TenantLocationConfigRequest { config };
|
||||
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{}/location_config",
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
//! database size. For example, if the logical database size is 10 GB, we would
|
||||
//! generate new image layers every 10 GB of WAL.
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
@@ -125,6 +126,7 @@ async fn compact_level<E: CompactionJobExecutor>(
|
||||
}
|
||||
|
||||
let mut state = LevelCompactionState {
|
||||
shard_identity: *executor.get_shard_identity(),
|
||||
target_file_size,
|
||||
_lsn_range: lsn_range.clone(),
|
||||
layers: layer_fragments,
|
||||
@@ -164,6 +166,8 @@ struct LevelCompactionState<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
shard_identity: ShardIdentity,
|
||||
|
||||
// parameters
|
||||
target_file_size: u64,
|
||||
|
||||
@@ -366,6 +370,7 @@ where
|
||||
.executor
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?,
|
||||
&self.shard_identity,
|
||||
) * 8192;
|
||||
|
||||
let wal_size = job
|
||||
@@ -430,7 +435,7 @@ where
|
||||
keyspace,
|
||||
self.target_file_size / 8192,
|
||||
);
|
||||
while let Some(key_range) = window.choose_next_image() {
|
||||
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
|
||||
new_jobs.push(CompactionJob::<E> {
|
||||
key_range,
|
||||
lsn_range: job.lsn_range.clone(),
|
||||
@@ -623,7 +628,12 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
||||
}
|
||||
|
||||
// Advance the cursor until it reaches 'target_keysize'.
|
||||
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
|
||||
fn advance_until_size(
|
||||
&mut self,
|
||||
w: &KeyspaceWindowHead<K>,
|
||||
max_size: u64,
|
||||
shard_identity: &ShardIdentity,
|
||||
) {
|
||||
while self.accum_keysize < max_size && !self.reached_end(w) {
|
||||
let curr_range = &w.keyspace[self.keyspace_idx];
|
||||
if self.end_key < curr_range.start {
|
||||
@@ -632,7 +642,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
||||
}
|
||||
|
||||
// We're now within 'curr_range'. Can we advance past it completely?
|
||||
let distance = K::key_range_size(&(self.end_key..curr_range.end));
|
||||
let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
|
||||
if (self.accum_keysize + distance as u64) < max_size {
|
||||
// oh yeah, it fits
|
||||
self.end_key = curr_range.end;
|
||||
@@ -641,7 +651,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
||||
} else {
|
||||
// advance within the range
|
||||
let skip_key = self.end_key.skip_some();
|
||||
let distance = K::key_range_size(&(self.end_key..skip_key));
|
||||
let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
|
||||
if (self.accum_keysize + distance as u64) < max_size {
|
||||
self.end_key = skip_key;
|
||||
self.accum_keysize += distance as u64;
|
||||
@@ -677,7 +687,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
fn choose_next_image(&mut self) -> Option<Range<K>> {
|
||||
fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
|
||||
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
|
||||
// we've reached the end
|
||||
return None;
|
||||
@@ -687,6 +697,7 @@ where
|
||||
next_pos.advance_until_size(
|
||||
&self.head,
|
||||
self.start_pos.accum_keysize + self.head.target_keysize,
|
||||
shard_identity,
|
||||
);
|
||||
|
||||
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
|
||||
@@ -695,6 +706,7 @@ where
|
||||
end_pos.advance_until_size(
|
||||
&self.head,
|
||||
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
|
||||
shard_identity,
|
||||
);
|
||||
if end_pos.reached_end(&self.head) {
|
||||
// gobble up any unused keyspace between the last used key and end of the range
|
||||
|
||||
@@ -5,6 +5,7 @@ use crate::interface::*;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{Stream, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::VecDeque;
|
||||
@@ -13,11 +14,17 @@ use std::ops::{DerefMut, Range};
|
||||
use std::pin::Pin;
|
||||
use std::task::{ready, Poll};
|
||||
|
||||
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
|
||||
pub fn keyspace_total_size<K>(
|
||||
keyspace: &CompactionKeySpace<K>,
|
||||
shard_identity: &ShardIdentity,
|
||||
) -> u64
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
|
||||
keyspace
|
||||
.iter()
|
||||
.map(|r| K::key_range_size(r, shard_identity) as u64)
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//! All the heavy lifting is done by the create_image and create_delta
|
||||
//! functions that the implementor provides.
|
||||
use futures::Future;
|
||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
||||
use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
|
||||
use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -32,6 +32,8 @@ pub trait CompactionJobExecutor {
|
||||
// Functions that the planner uses to support its decisions
|
||||
// ----
|
||||
|
||||
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||
|
||||
/// Return all layers that overlap the given bounding box.
|
||||
fn get_layers(
|
||||
&mut self,
|
||||
@@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
|
||||
///
|
||||
/// This returns u32, for compatibility with Repository::key. If the
|
||||
/// distance is larger, return u32::MAX.
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32;
|
||||
fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;
|
||||
|
||||
// return "self + 1"
|
||||
fn next(&self) -> Self;
|
||||
@@ -113,8 +115,8 @@ impl CompactionKey for Key {
|
||||
const MIN: Self = Self::MIN;
|
||||
const MAX: Self = Self::MAX;
|
||||
|
||||
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
|
||||
key_range_size(r)
|
||||
fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
|
||||
ShardedRange::new(r.clone(), shard_identity).page_count()
|
||||
}
|
||||
fn next(&self) -> Key {
|
||||
(self as &Key).next()
|
||||
|
||||
@@ -3,6 +3,7 @@ mod draw;
|
||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use rand::Rng;
|
||||
use tracing::info;
|
||||
|
||||
@@ -71,7 +72,7 @@ impl interface::CompactionKey for Key {
|
||||
const MIN: Self = u64::MIN;
|
||||
const MAX: Self = u64::MAX;
|
||||
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32 {
|
||||
fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
|
||||
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
|
||||
}
|
||||
|
||||
@@ -434,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline {
|
||||
type ImageLayer = Arc<MockImageLayer>;
|
||||
type RequestContext = MockRequestContext;
|
||||
|
||||
fn get_shard_identity(&self) -> &ShardIdentity {
|
||||
static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
|
||||
&IDENTITY
|
||||
}
|
||||
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
|
||||
@@ -10,10 +10,10 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI;
|
||||
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BasebackupError {
|
||||
#[error("basebackup pageserver error {0:#}")]
|
||||
Server(#[from] anyhow::Error),
|
||||
#[error("basebackup client error {0:#}")]
|
||||
Client(#[source] io::Error),
|
||||
}
|
||||
|
||||
/// Create basebackup with non-rel data in it.
|
||||
/// Only include relational data if 'full_backup' is true.
|
||||
///
|
||||
@@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>(
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
ctx: &'a RequestContext,
|
||||
) -> anyhow::Result<()>
|
||||
) -> Result<(), BasebackupError>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
@@ -92,8 +100,10 @@ where
|
||||
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) {
|
||||
ensure!(backup_prev == provided_prev_lsn);
|
||||
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
|
||||
)));
|
||||
}
|
||||
provided_prev_lsn
|
||||
} else {
|
||||
@@ -159,15 +169,26 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
|
||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||
|
||||
match kind {
|
||||
SlruKind::Clog => {
|
||||
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
|
||||
if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"invalid SlruKind::Clog record: block.len()={}",
|
||||
block.len()
|
||||
)));
|
||||
}
|
||||
}
|
||||
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
|
||||
ensure!(block.len() == BLCKSZ as usize);
|
||||
if block.len() != BLCKSZ as usize {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"invalid {:?} record: block.len()={}",
|
||||
kind,
|
||||
block.len()
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,12 +215,15 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> anyhow::Result<()> {
|
||||
async fn flush(&mut self) -> Result<(), BasebackupError> {
|
||||
let nblocks = self.buf.len() / BLCKSZ as usize;
|
||||
let (kind, segno) = self.current_segment.take().unwrap();
|
||||
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
||||
self.ar
|
||||
.append(&header, self.buf.as_slice())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
self.total_blocks += nblocks;
|
||||
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
@@ -209,7 +233,7 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn finish(mut self) -> anyhow::Result<()> {
|
||||
async fn finish(mut self) -> Result<(), BasebackupError> {
|
||||
let res = if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
@@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
async fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||
async fn send_tarball(mut self) -> Result<(), BasebackupError> {
|
||||
// TODO include checksum
|
||||
|
||||
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
|
||||
@@ -262,16 +286,25 @@ where
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.partition(
|
||||
self.timeline.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(part, self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
slru_builder.add_block(&key, block?).await?;
|
||||
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
slru_builder.add_block(&key, block).await?;
|
||||
}
|
||||
}
|
||||
slru_builder.finish().await?;
|
||||
@@ -279,8 +312,11 @@ where
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
for ((spcnode, dbnode), has_relmap_file) in self
|
||||
.timeline
|
||||
.list_dbdirs(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
{
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
@@ -289,7 +325,8 @@ where
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
// contents of UNLOGGED relations. Postgres copies it in
|
||||
@@ -297,20 +334,7 @@ where
|
||||
if rel.forknum == INIT_FORKNUM {
|
||||
// I doubt we need _init fork itself, but having it at least
|
||||
// serves as a marker relation is unlogged.
|
||||
if let Err(_e) = self.add_rel(rel, rel).await {
|
||||
if self
|
||||
.timeline
|
||||
.get_shard_identity()
|
||||
.is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
|
||||
{
|
||||
// Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
|
||||
// whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
|
||||
// postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
|
||||
// recreate.
|
||||
tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
self.add_rel(rel, rel).await?;
|
||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
||||
continue;
|
||||
}
|
||||
@@ -325,7 +349,12 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
|
||||
for (path, content) in self
|
||||
.timeline
|
||||
.list_aux_files(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
{
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
@@ -356,34 +385,41 @@ where
|
||||
for xid in self
|
||||
.timeline
|
||||
.list_twophase_files(self.lsn, self.ctx)
|
||||
.await?
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
{
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
|
||||
fail_point!("basebackup-before-control-file", |_| {
|
||||
bail!("failpoint basebackup-before-control-file")
|
||||
Err(BasebackupError::Server(anyhow!(
|
||||
"failpoint basebackup-before-control-file"
|
||||
)))
|
||||
});
|
||||
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file().await?;
|
||||
self.ar.finish().await?;
|
||||
self.ar.finish().await.map_err(BasebackupError::Client)?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add contents of relfilenode `src`, naming it as `dst`.
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
if nblocks == 0 {
|
||||
let file_name = dst.to_segfile_name(0);
|
||||
let header = new_tar_header(&file_name, 0)?;
|
||||
self.ar.append(&header, &mut io::empty()).await?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -398,13 +434,17 @@ where
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
let file_name = dst.to_segfile_name(seg as u32);
|
||||
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
||||
self.ar.append(&header, segment_data.as_slice()).await?;
|
||||
self.ar
|
||||
.append(&header, segment_data.as_slice())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
seg += 1;
|
||||
startblk = endblk;
|
||||
@@ -424,20 +464,22 @@ where
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
has_relmap_file: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), BasebackupError> {
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
|
||||
ensure!(
|
||||
img.len()
|
||||
== dispatch_pgversion!(
|
||||
self.timeline.pg_version,
|
||||
pgv::bindings::SIZEOF_RELMAPFILE
|
||||
)
|
||||
);
|
||||
if img.len()
|
||||
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
|
||||
{
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
|
||||
img.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
Some(img)
|
||||
} else {
|
||||
@@ -450,14 +492,20 @@ where
|
||||
ver => format!("{ver}\x0A"),
|
||||
};
|
||||
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes()).await?;
|
||||
self.ar
|
||||
.append(&header, pg_version_str.as_bytes())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
info!("timeline.pg_version {}", self.timeline.pg_version);
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
// filenode map for global tablespace
|
||||
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..]).await?;
|
||||
self.ar
|
||||
.append(&header, &img[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
} else {
|
||||
warn!("global/pg_filenode.map is missing");
|
||||
}
|
||||
@@ -476,18 +524,26 @@ where
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.is_empty()
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
// User defined tablespaces are not supported
|
||||
ensure!(spcnode == DEFAULTTABLESPACE_OID);
|
||||
if spcnode != DEFAULTTABLESPACE_OID {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
|
||||
)));
|
||||
}
|
||||
|
||||
// Append dir path for each database
|
||||
let path = format!("base/{}", dbnode);
|
||||
let header = new_tar_header_dir(&path)?;
|
||||
self.ar.append(&header, &mut io::empty()).await?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
@@ -497,11 +553,17 @@ where
|
||||
ver => format!("{ver}\x0A"),
|
||||
};
|
||||
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes()).await?;
|
||||
self.ar
|
||||
.append(&header, pg_version_str.as_bytes())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..]).await?;
|
||||
self.ar
|
||||
.append(&header, &img[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
@@ -510,11 +572,12 @@ where
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_twophase_file(xid, self.lsn, self.ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -522,7 +585,10 @@ where
|
||||
buf.put_u32_le(crc);
|
||||
let path = format!("pg_twophase/{:>08X}", xid);
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
self.ar.append(&header, &buf[..]).await?;
|
||||
self.ar
|
||||
.append(&header, &buf[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -531,24 +597,28 @@ where
|
||||
// Add generated pg_control file and bootstrap WAL segment.
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.lsn == self.timeline.get_ancestor_lsn() {
|
||||
write!(zenith_signal, "PREV LSN: none")?;
|
||||
write!(zenith_signal, "PREV LSN: none")
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")?;
|
||||
write!(zenith_signal, "PREV LSN: invalid")
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
}
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
}
|
||||
self.ar
|
||||
.append(
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
@@ -570,7 +640,10 @@ where
|
||||
|
||||
//send pg_control
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &pg_control_bytes[..]).await?;
|
||||
self.ar
|
||||
.append(&header, &pg_control_bytes[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
@@ -585,8 +658,16 @@ where
|
||||
self.lsn,
|
||||
)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..]).await?;
|
||||
if wal_seg.len() != WAL_SEGMENT_SIZE {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
|
||||
wal_seg.len()
|
||||
)));
|
||||
}
|
||||
self.ar
|
||||
.append(&header, &wal_seg[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
use std::{collections::HashMap, env};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -51,7 +51,7 @@ pub mod defaults {
|
||||
use crate::tenant::config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
|
||||
pub use pageserver_api::{
|
||||
pub use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
@@ -335,26 +335,6 @@ impl<T: Clone> BuilderValue<T> {
|
||||
}
|
||||
}
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(serde::Deserialize)]
|
||||
pub(crate) struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
pub(crate) postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub(crate) postgres_port: u16,
|
||||
pub(crate) http_host: String,
|
||||
pub(crate) http_port: u16,
|
||||
|
||||
// Deployment tools may write fields to the metadata file beyond what we
|
||||
// use in this type: this type intentionally only names fields that require.
|
||||
#[serde(flatten)]
|
||||
pub(crate) other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
// needed to simplify config construction
|
||||
#[derive(Default)]
|
||||
struct PageServerConfigBuilder {
|
||||
|
||||
@@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
config::{NodeMetadata, PageServerConf},
|
||||
virtual_file::on_fatal_io_error,
|
||||
};
|
||||
use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
@@ -65,7 +63,7 @@ impl ControlPlaneClient {
|
||||
let mut client = reqwest::ClientBuilder::new();
|
||||
|
||||
if let Some(jwt) = &conf.control_plane_api_token {
|
||||
let mut headers = hyper::HeaderMap::new();
|
||||
let mut headers = reqwest::header::HeaderMap::new();
|
||||
headers.insert(
|
||||
"Authorization",
|
||||
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
|
||||
|
||||
@@ -782,9 +782,6 @@ components:
|
||||
required:
|
||||
- mode
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
description: Not used, scheduled for removal.
|
||||
mode:
|
||||
type: string
|
||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
||||
|
||||
@@ -1918,12 +1918,14 @@ async fn timeline_collect_keyspace(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let keys = timeline
|
||||
let (dense_ks, sparse_ks) = timeline
|
||||
.collect_keyspace(at_lsn, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
|
||||
// This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
|
||||
// Therefore, we split dense/sparse keys in this API.
|
||||
let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
|
||||
|
||||
json_response(StatusCode::OK, res)
|
||||
}
|
||||
@@ -2158,6 +2160,27 @@ async fn secondary_download_handler(
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
async fn secondary_status_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
|
||||
let Some(secondary_tenant) = state
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(tenant_shard_id)
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
|
||||
));
|
||||
};
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
json_response(StatusCode::OK, progress)
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -2519,6 +2542,9 @@ pub fn make_router(
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_shard_id/secondary/status", |r| {
|
||||
api_handler(r, secondary_status_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
|
||||
api_handler(r, secondary_download_handler)
|
||||
})
|
||||
|
||||
@@ -51,8 +51,8 @@ pub(crate) enum StorageTimeOperation {
|
||||
#[strum(serialize = "gc")]
|
||||
Gc,
|
||||
|
||||
#[strum(serialize = "update gc info")]
|
||||
UpdateGcInfo,
|
||||
#[strum(serialize = "find gc cutoffs")]
|
||||
FindGcCutoffs,
|
||||
|
||||
#[strum(serialize = "create tenant")]
|
||||
CreateTenant,
|
||||
@@ -194,6 +194,11 @@ pub(crate) struct GetVectoredLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) struct ScanLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
|
||||
impl GetVectoredLatency {
|
||||
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
|
||||
// cardinality of the metric.
|
||||
@@ -204,6 +209,48 @@ impl GetVectoredLatency {
|
||||
}
|
||||
}
|
||||
|
||||
impl ScanLatency {
|
||||
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
|
||||
// cardinality of the metric.
|
||||
const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler];
|
||||
|
||||
pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
|
||||
self.map[task_kind].as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct ScanLatencyOngoingRecording<'a> {
|
||||
parent: &'a Histogram,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl<'a> ScanLatencyOngoingRecording<'a> {
|
||||
pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> {
|
||||
let start = Instant::now();
|
||||
ScanLatencyOngoingRecording { parent, start }
|
||||
}
|
||||
|
||||
pub(crate) fn observe(self, throttled: Option<Duration>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = if let Some(throttled) = throttled {
|
||||
elapsed.checked_sub(throttled)
|
||||
} else {
|
||||
Some(elapsed)
|
||||
};
|
||||
if let Some(ex_throttled) = ex_throttled {
|
||||
self.parent.observe(ex_throttled.as_secs_f64());
|
||||
} else {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!("error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
@@ -227,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_scan_seconds",
|
||||
"Time spent in scan, excluding time spent in timeline_get_throttle.",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
ScanLatency {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
|
||||
|
||||
if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
|
||||
let task_kind = task_kind.into();
|
||||
Some(inner.with_label_values(&[task_kind]))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
@@ -1989,7 +2059,7 @@ pub(crate) struct TimelineMetrics {
|
||||
pub imitate_logical_size_histo: StorageTimeMetrics,
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub update_gc_info_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
@@ -2050,8 +2120,8 @@ impl TimelineMetrics {
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let update_gc_info_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::UpdateGcInfo,
|
||||
let find_gc_cutoffs_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::FindGcCutoffs,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
@@ -2098,7 +2168,7 @@ impl TimelineMetrics {
|
||||
logical_size_histo,
|
||||
imitate_logical_size_histo,
|
||||
garbage_collect_histo,
|
||||
update_gc_info_histo,
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
resident_physical_size_gauge,
|
||||
|
||||
@@ -48,6 +48,7 @@ use utils::{
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
@@ -1236,6 +1237,13 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
fn map_basebackup_error(err: BasebackupError) -> QueryError {
|
||||
match err {
|
||||
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
|
||||
BasebackupError::Server(e) => QueryError::Other(e),
|
||||
}
|
||||
}
|
||||
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
// check that the timeline exists
|
||||
@@ -1261,7 +1269,8 @@ impl PageServerHandler {
|
||||
let lsn_awaited_after = started.elapsed();
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)
|
||||
.map_err(QueryError::Disconnected)?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||
@@ -1276,7 +1285,8 @@ impl PageServerHandler {
|
||||
full_backup,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
} else {
|
||||
let mut writer = pgb.copyout_writer();
|
||||
if gzip {
|
||||
@@ -1297,9 +1307,13 @@ impl PageServerHandler {
|
||||
full_backup,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
encoder.shutdown().await?;
|
||||
encoder
|
||||
.shutdown()
|
||||
.await
|
||||
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
|
||||
} else {
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
@@ -1309,11 +1323,13 @@ impl PageServerHandler {
|
||||
full_backup,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
}
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)
|
||||
.map_err(QueryError::Disconnected)?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
let basebackup_after = started
|
||||
@@ -1368,7 +1384,6 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
|
||||
@@ -23,6 +23,7 @@ use pageserver_api::key::{
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -278,7 +279,7 @@ impl Timeline {
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
|
||||
let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
@@ -378,7 +379,7 @@ impl Timeline {
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.segments.get(&segno).is_some();
|
||||
let exists = dir.segments.contains(&segno);
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
@@ -730,11 +731,13 @@ impl Timeline {
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed maybe removed from the underlying storage (from
|
||||
/// that LSN forwards).
|
||||
///
|
||||
/// The return value is (dense keyspace, sparse keyspace).
|
||||
pub(crate) async fn collect_keyspace(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<KeySpace, CollectKeySpaceError> {
|
||||
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
@@ -806,7 +809,12 @@ impl Timeline {
|
||||
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
Ok(result.to_keyspace())
|
||||
|
||||
Ok((
|
||||
result.to_keyspace(),
|
||||
/* AUX sparse key space */
|
||||
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
|
||||
))
|
||||
}
|
||||
|
||||
/// Get cached size of relation if it not updated after specified LSN
|
||||
@@ -1135,21 +1143,22 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?;
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
|
||||
// Didn't exist. Update dbdir
|
||||
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
let mut rel_dir =
|
||||
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
|
||||
// Didn't exist. Update dbdir
|
||||
e.insert(false);
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// and create the RelDirectory
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?
|
||||
};
|
||||
// and create the RelDirectory
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
|
||||
@@ -363,6 +363,8 @@ pub enum TaskKind {
|
||||
|
||||
EphemeralFilePreWarmPageCache,
|
||||
|
||||
LayerDownload,
|
||||
|
||||
#[cfg(test)]
|
||||
UnitTest,
|
||||
}
|
||||
|
||||
@@ -64,6 +64,7 @@ use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use self::timeline::WaitLsnError;
|
||||
use self::timeline::{GcCutoffs, GcInfo};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
@@ -86,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::InitializationOrder;
|
||||
use std::cmp::min;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
@@ -2812,7 +2812,48 @@ impl Tenant {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
// grab mutex to prevent new timelines from being created here.
|
||||
// before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
|
||||
// currently visible timelines.
|
||||
let timelines = self
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tl| match target_timeline_id.as_ref() {
|
||||
Some(target) => &tl.timeline_id == target,
|
||||
None => true,
|
||||
})
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
|
||||
HashMap::with_capacity(timelines.len());
|
||||
|
||||
for timeline in timelines.iter() {
|
||||
let cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
|
||||
|
||||
match res {
|
||||
Ok(cutoffs) => {
|
||||
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
|
||||
assert!(old.is_none());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !self.is_active() {
|
||||
anyhow::bail!("shutting down");
|
||||
}
|
||||
|
||||
// grab mutex to prevent new timelines from being created here; avoid doing long operations
|
||||
// because that will stall branch creation.
|
||||
let gc_cs = self.gc_cs.lock().await;
|
||||
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
@@ -2874,11 +2915,6 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
@@ -2886,9 +2922,27 @@ impl Tenant {
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
timeline
|
||||
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
|
||||
.await?;
|
||||
|
||||
{
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
match gc_cutoffs.remove(&timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
*target = GcInfo {
|
||||
retain_lsns: branchpoints,
|
||||
cutoffs,
|
||||
};
|
||||
}
|
||||
None => {
|
||||
// reasons for this being unavailable:
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
//
|
||||
// in both cases, refreshing the branchpoints is correct.
|
||||
target.retain_lsns = branchpoints;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
gc_timelines.push(timeline);
|
||||
}
|
||||
@@ -2977,7 +3031,7 @@ impl Tenant {
|
||||
// and then the planned GC cutoff
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
|
||||
let cutoff = gc_info.min_cutoff();
|
||||
if start_lsn < cutoff {
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
|
||||
@@ -3871,8 +3925,9 @@ mod tests {
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::key::NON_INHERITED_RANGE;
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
@@ -4512,11 +4567,25 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn bulk_insert_compact_gc(
|
||||
timeline: Arc<Timeline>,
|
||||
tenant: &Tenant,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
lsn: Lsn,
|
||||
repeat: usize,
|
||||
key_count: usize,
|
||||
) -> anyhow::Result<()> {
|
||||
let compact = true;
|
||||
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
|
||||
}
|
||||
|
||||
async fn bulk_insert_maybe_compact_gc(
|
||||
tenant: &Tenant,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
mut lsn: Lsn,
|
||||
repeat: usize,
|
||||
key_count: usize,
|
||||
compact: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let mut blknum = 0;
|
||||
@@ -4524,6 +4593,8 @@ mod tests {
|
||||
// Enforce that key range is monotonously increasing
|
||||
let mut keyspace = KeySpaceAccum::new();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
for _ in 0..repeat {
|
||||
for _ in 0..key_count {
|
||||
test_key.field6 = blknum;
|
||||
@@ -4545,22 +4616,19 @@ mod tests {
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
let cutoff = timeline.get_last_record_lsn();
|
||||
|
||||
timeline
|
||||
.update_gc_info(
|
||||
Vec::new(),
|
||||
cutoff,
|
||||
Duration::ZERO,
|
||||
&CancellationToken::new(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline.freeze_and_flush().await?;
|
||||
timeline
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
|
||||
if compact {
|
||||
// this requires timeline to be &Arc<Timeline>
|
||||
timeline.compact(&cancel, EnumSet::empty(), ctx).await?;
|
||||
}
|
||||
|
||||
// this doesn't really need to use the timeline_id target, but it is closer to what it
|
||||
// originally was.
|
||||
let res = tenant
|
||||
.gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
|
||||
.await?;
|
||||
timeline.gc().await?;
|
||||
|
||||
assert_eq!(res.layers_removed, 0, "this never removes anything");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4579,7 +4647,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -4610,7 +4678,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let guard = tline.layers.read().await;
|
||||
guard.layer_map().dump(true, &ctx).await?;
|
||||
@@ -4723,15 +4791,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let images = vectored_res?;
|
||||
let mut key = NON_INHERITED_RANGE.start;
|
||||
while key < NON_INHERITED_RANGE.end {
|
||||
assert!(matches!(
|
||||
images[&key],
|
||||
Err(PageReconstructError::MissingKey(_))
|
||||
));
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
assert!(images.is_empty());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5042,13 +5102,29 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_random_updates")?;
|
||||
let names_algorithms = [
|
||||
("test_random_updates_legacy", CompactionAlgorithm::Legacy),
|
||||
("test_random_updates_tiered", CompactionAlgorithm::Tiered),
|
||||
];
|
||||
for (name, algorithm) in names_algorithms {
|
||||
test_random_updates_algorithm(name, algorithm).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn test_random_updates_algorithm(
|
||||
name: &'static str,
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
const NUM_KEYS: usize = 1000;
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
@@ -5107,22 +5183,11 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
// Perform a cycle of flush, compact, and GC
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
tline
|
||||
.update_gc_info(
|
||||
Vec::new(),
|
||||
cutoff,
|
||||
Duration::ZERO,
|
||||
&CancellationToken::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
// Perform a cycle of flush, and GC
|
||||
tline.freeze_and_flush().await?;
|
||||
tline
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
tline.gc().await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -5143,6 +5208,8 @@ mod tests {
|
||||
|
||||
let mut keyspace = KeySpaceAccum::new();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Track when each page was last modified. Used to assert that
|
||||
// a read sees the latest page version.
|
||||
let mut updated = [Lsn(0); NUM_KEYS];
|
||||
@@ -5206,21 +5273,11 @@ mod tests {
|
||||
}
|
||||
|
||||
// Perform a cycle of flush, compact, and GC
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
tline
|
||||
.update_gc_info(
|
||||
Vec::new(),
|
||||
cutoff,
|
||||
Duration::ZERO,
|
||||
&CancellationToken::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
tline.freeze_and_flush().await?;
|
||||
tline
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
tline.gc().await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -5402,19 +5459,140 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_read_at_max_lsn")?;
|
||||
let names_algorithms = [
|
||||
("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
|
||||
("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
|
||||
];
|
||||
for (name, algorithm) in names_algorithms {
|
||||
test_read_at_max_lsn_algorithm(name, algorithm).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn test_read_at_max_lsn_algorithm(
|
||||
name: &'static str,
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||
let compact = false;
|
||||
bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?;
|
||||
|
||||
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let read_lsn = Lsn(u64::MAX - 1);
|
||||
|
||||
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
|
||||
let result = tline.get(test_key, read_lsn, &ctx).await;
|
||||
assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_scan() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_scan")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
const NUM_KEYS: usize = 1000;
|
||||
const STEP: usize = 100; // random update + scan base_key + idx * STEP
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
base_key.field1 = AUX_KEY_PREFIX;
|
||||
let mut test_key = base_key;
|
||||
|
||||
// Track when each page was last modified. Used to assert that
|
||||
// a read sees the latest page version.
|
||||
let mut updated = [Lsn(0); NUM_KEYS];
|
||||
|
||||
let mut lsn = Lsn(0x10);
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
updated[blknum] = lsn;
|
||||
drop(writer);
|
||||
}
|
||||
|
||||
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
|
||||
|
||||
for _ in 0..10 {
|
||||
// Read all the blocks
|
||||
for (blknum, last_lsn) in updated.iter().enumerate() {
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
|
||||
let mut cnt = 0;
|
||||
for (key, value) in tline
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
ValuesReconstructState::default(),
|
||||
&ctx,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
let blknum = key.field6 as usize;
|
||||
let value = value?;
|
||||
assert!(blknum % STEP == 0);
|
||||
let blknum = blknum / STEP;
|
||||
assert_eq!(
|
||||
value,
|
||||
test_img(&format!("{} at {}", blknum, updated[blknum]))
|
||||
);
|
||||
cnt += 1;
|
||||
}
|
||||
|
||||
assert_eq!(cnt, NUM_KEYS);
|
||||
|
||||
for _ in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
updated[blknum] = lsn;
|
||||
}
|
||||
|
||||
// Perform a cycle of flush, compact, and GC
|
||||
tline.freeze_and_flush().await?;
|
||||
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
|
||||
tenant
|
||||
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -130,8 +130,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let (src_buf, res) = self.inner.write_all(src_buf).await;
|
||||
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
|
||||
let nbytes = match res {
|
||||
Ok(nbytes) => nbytes,
|
||||
Err(e) => return (src_buf, Err(e)),
|
||||
@@ -142,9 +143,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
|
||||
#[inline(always)]
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
||||
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
|
||||
let buf = std::mem::take(&mut self.buf);
|
||||
let (mut buf, res) = self.inner.write_all(buf).await;
|
||||
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
|
||||
res?;
|
||||
buf.clear();
|
||||
self.buf = buf;
|
||||
@@ -165,10 +166,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
return self.write_all_unbuffered(src_buf).await;
|
||||
return self.write_all_unbuffered(src_buf, ctx).await;
|
||||
}
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
let src_buf_len = src_buf.bytes_init();
|
||||
@@ -183,7 +185,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
}
|
||||
// Then, if the buffer is full, flush it out
|
||||
if self.buf.len() == Self::CAPACITY {
|
||||
if let Err(e) = self.flush_buffer().await {
|
||||
if let Err(e) = self.flush_buffer(ctx).await {
|
||||
return (Slice::into_inner(src_buf), Err(e));
|
||||
}
|
||||
}
|
||||
@@ -199,7 +201,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
assert_eq!(copied, src_buf.len());
|
||||
Slice::into_inner(src_buf)
|
||||
} else {
|
||||
let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
|
||||
let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
|
||||
if let Err(e) = res {
|
||||
return (src_buf, Err(e));
|
||||
}
|
||||
@@ -216,6 +218,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
let offset = self.offset;
|
||||
|
||||
@@ -227,7 +230,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
if len < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
io_buf.put_u8(len as u8);
|
||||
self.write_all(io_buf).await
|
||||
self.write_all(io_buf, ctx).await
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if len > 0x7fff_ffff {
|
||||
@@ -242,7 +245,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
self.write_all(io_buf).await
|
||||
self.write_all(io_buf, ctx).await
|
||||
}
|
||||
}
|
||||
.await;
|
||||
@@ -251,7 +254,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
Ok(_) => (),
|
||||
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
|
||||
}
|
||||
let (srcbuf, res) = self.write_all(srcbuf).await;
|
||||
let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
|
||||
(srcbuf, res.map(|_| offset))
|
||||
}
|
||||
}
|
||||
@@ -261,8 +264,8 @@ impl BlobWriter<true> {
|
||||
///
|
||||
/// This function flushes the internal buffer before giving access
|
||||
/// to the underlying `VirtualFile`.
|
||||
pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
|
||||
self.flush_buffer().await?;
|
||||
pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<VirtualFile, Error> {
|
||||
self.flush_buffer(ctx).await?;
|
||||
Ok(self.inner)
|
||||
}
|
||||
|
||||
@@ -299,16 +302,16 @@ mod tests {
|
||||
let file = VirtualFile::create(pathbuf.as_path()).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = wtr.write_blob(blob.clone()).await;
|
||||
let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
|
||||
let offs = res?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
// Write out one page worth of zeros so that we can
|
||||
// read again with read_blk
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
|
||||
let offs = res?;
|
||||
println!("Writing final blob at offs={offs}");
|
||||
wtr.flush_buffer().await?;
|
||||
wtr.flush_buffer(&ctx).await?;
|
||||
}
|
||||
|
||||
let file = VirtualFile::open(pathbuf.as_path()).await?;
|
||||
|
||||
@@ -74,7 +74,7 @@ impl EphemeralFile {
|
||||
pub(crate) async fn write_blob(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
_ctx: &RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
@@ -83,15 +83,15 @@ impl EphemeralFile {
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
|
||||
self.rw.write_all_borrowed(&len_buf).await?;
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
self.rw.write_all_borrowed(&len_buf).await?;
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf).await?;
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
|
||||
@@ -35,10 +35,14 @@ impl RW {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
|
||||
pub(crate) async fn write_all_borrowed(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<usize, io::Error> {
|
||||
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
|
||||
// because Compute is unlikely to access recently written data.
|
||||
self.rw.write_all_borrowed(srcbuf).await
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) fn bytes_written(&self) -> u64 {
|
||||
@@ -134,6 +138,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
|
||||
>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let buf = buf.slice(..);
|
||||
let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
|
||||
@@ -150,7 +155,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
|
||||
);
|
||||
|
||||
// Do the IO.
|
||||
let iobuf = match self.file.write_all(buf).await {
|
||||
let iobuf = match self.file.write_all(buf, ctx).await {
|
||||
(iobuf, Ok(nwritten)) => {
|
||||
assert_eq!(nwritten, buflen);
|
||||
iobuf
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
mod zero_padded;
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
page_cache::PAGE_SZ,
|
||||
virtual_file::owned_buffers_io::{
|
||||
self,
|
||||
@@ -60,8 +61,12 @@ where
|
||||
self.buffered_writer.as_inner().as_inner()
|
||||
}
|
||||
|
||||
pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.buffered_writer.write_buffered_borrowed(buf).await
|
||||
pub async fn write_all_borrowed(
|
||||
&mut self,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<usize> {
|
||||
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
|
||||
}
|
||||
|
||||
pub fn bytes_written(&self) -> u64 {
|
||||
|
||||
@@ -588,7 +588,7 @@ impl LayerMap {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
coverage.push((kr, current_val.take()));
|
||||
current_key = change_key;
|
||||
current_val = change_val.clone();
|
||||
current_val.clone_from(&change_val);
|
||||
}
|
||||
|
||||
// Add the final interval
|
||||
@@ -672,12 +672,12 @@ impl LayerMap {
|
||||
// Loop through the delta coverage and recurse on each part
|
||||
for (change_key, change_val) in version.delta_coverage.range(start..end) {
|
||||
// If there's a relevant delta in this part, add 1 and recurse down
|
||||
if let Some(val) = current_val {
|
||||
if let Some(val) = ¤t_val {
|
||||
if val.get_lsn_range().end > lsn.start {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let base_count = Self::is_reimage_worthy(val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
@@ -689,17 +689,17 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
current_key = change_key;
|
||||
current_val = change_val.clone();
|
||||
current_val.clone_from(&change_val);
|
||||
}
|
||||
|
||||
// Consider the last part
|
||||
if let Some(val) = current_val {
|
||||
if let Some(val) = ¤t_val {
|
||||
if val.get_lsn_range().end > lsn.start {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let base_count = Self::is_reimage_worthy(val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
@@ -916,6 +916,7 @@ mod tests {
|
||||
assert_eq!(lhs, rhs);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn brute_force_range_search(
|
||||
layer_map: &LayerMap,
|
||||
key_range: Range<Key>,
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
@@ -253,17 +254,15 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
|
||||
/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
|
||||
/// the slower actual deletion in the background.
|
||||
///
|
||||
/// This is "safe" in that that it won't leave behind a partially deleted directory
|
||||
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
|
||||
/// the contents.
|
||||
///
|
||||
/// This is pageserver-specific, as it relies on future processes after a crash to check
|
||||
/// for TEMP_FILE_SUFFIX when loading things.
|
||||
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
|
||||
let tmp_path = safe_rename_tenant_dir(path).await?;
|
||||
fs::remove_dir_all(tmp_path).await
|
||||
}
|
||||
|
||||
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
|
||||
let parent = path
|
||||
.as_ref()
|
||||
@@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
Ok(tmp_path)
|
||||
}
|
||||
|
||||
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
|
||||
/// the background, and thereby avoid blocking any API requests on this deletion completing.
|
||||
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
|
||||
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
|
||||
let task_tenant_id = None;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
task_tenant_id,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
@@ -570,7 +591,11 @@ pub async fn init_tenant_mgr(
|
||||
);
|
||||
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
|
||||
|
||||
// Construct `Tenant` objects and start them running
|
||||
// Accumulate futures for writing tenant configs, so that we can execute in parallel
|
||||
let mut config_write_futs = Vec::new();
|
||||
|
||||
// Update the location configs according to the re-attach response and persist them to disk
|
||||
tracing::info!("Updating {} location configs", tenant_configs.len());
|
||||
for (tenant_shard_id, location_conf) in tenant_configs {
|
||||
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||
|
||||
@@ -597,18 +622,22 @@ pub async fn init_tenant_mgr(
|
||||
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
|
||||
SecondaryLocationConfig { warm: true };
|
||||
|
||||
// Update the location config according to the re-attach response
|
||||
if let Some(tenant_modes) = &tenant_modes {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
match tenant_modes.get(&tenant_shard_id) {
|
||||
None => {
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
|
||||
match safe_rename_tenant_dir(&tenant_dir_path).await {
|
||||
Ok(tmp_path) => {
|
||||
spawn_background_purge(tmp_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
|
||||
}
|
||||
};
|
||||
|
||||
// We deleted local content: move on to next tenant, don't try and spawn this one.
|
||||
continue;
|
||||
@@ -654,8 +683,32 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
// Presence of a generation number implies attachment: attach the tenant
|
||||
// if it wasn't already, and apply the generation number.
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
config_write_futs.push(async move {
|
||||
let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
|
||||
(tenant_shard_id, location_conf, r)
|
||||
});
|
||||
}
|
||||
|
||||
// Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
|
||||
tracing::info!(
|
||||
"Writing {} location config files...",
|
||||
config_write_futs.len()
|
||||
);
|
||||
let config_write_results = futures::stream::iter(config_write_futs)
|
||||
.buffer_unordered(16)
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
|
||||
tracing::info!(
|
||||
"Spawning {} tenant shard locations...",
|
||||
config_write_results.len()
|
||||
);
|
||||
// For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
|
||||
for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
|
||||
// Errors writing configs are fatal
|
||||
config_write_result?;
|
||||
|
||||
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||
let shard_identity = location_conf.shard;
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => {
|
||||
@@ -1699,7 +1752,7 @@ impl TenantManager {
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
self.spawn_background_purge(tmp_path);
|
||||
spawn_background_purge(tmp_path);
|
||||
|
||||
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
@@ -1854,28 +1907,6 @@ impl TenantManager {
|
||||
shutdown_all_tenants0(self.tenants).await
|
||||
}
|
||||
|
||||
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
|
||||
/// the background, and thereby avoid blocking any API requests on this deletion completing.
|
||||
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
|
||||
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
|
||||
let task_tenant_id = None;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
task_tenant_id,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
pub(crate) async fn detach_tenant(
|
||||
&self,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -1892,7 +1923,7 @@ impl TenantManager {
|
||||
deletion_queue_client,
|
||||
)
|
||||
.await?;
|
||||
self.spawn_background_purge(tmp_path);
|
||||
spawn_background_purge(tmp_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -210,6 +210,7 @@ use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
@@ -505,6 +506,7 @@ impl RemoteTimelineClient {
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = {
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
@@ -522,6 +524,7 @@ impl RemoteTimelineClient {
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.measure_remote_op(
|
||||
RemoteOpFileKind::Layer,
|
||||
|
||||
@@ -18,6 +18,7 @@ use tracing::warn;
|
||||
use utils::backoff;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -40,6 +41,7 @@ use super::{
|
||||
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
|
||||
///
|
||||
/// Returns the size of the downloaded file.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
@@ -48,6 +50,7 @@ pub async fn download_layer_file<'a>(
|
||||
layer_file_name: &'a LayerFileName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -75,7 +78,7 @@ pub async fn download_layer_file<'a>(
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
@@ -133,6 +136,7 @@ async fn download_object<'a>(
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
cancel: &CancellationToken,
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let res = match crate::virtual_file::io_engine::get() {
|
||||
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
|
||||
@@ -208,10 +212,10 @@ async fn download_object<'a>(
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
buffered
|
||||
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
|
||||
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx)
|
||||
.await?;
|
||||
}
|
||||
let size_tracking = buffered.flush_and_into_inner().await?;
|
||||
let size_tracking = buffered.flush_and_into_inner(ctx).await?;
|
||||
Ok(size_tracking.into_inner())
|
||||
}
|
||||
.await?;
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
virtual_file::MaybeFatalIo,
|
||||
@@ -316,9 +317,13 @@ pub fn spawn_tasks(
|
||||
let (upload_req_tx, upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
|
||||
let downloader_task_ctx = RequestContext::new(
|
||||
TaskKind::SecondaryDownloads,
|
||||
crate::context::DownloadBehavior::Download,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::SecondaryDownloads,
|
||||
downloader_task_ctx.task_kind(),
|
||||
None,
|
||||
None,
|
||||
"secondary tenant downloads",
|
||||
@@ -330,6 +335,7 @@ pub fn spawn_tasks(
|
||||
download_req_rx,
|
||||
bg_jobs_clone,
|
||||
cancel_clone,
|
||||
downloader_task_ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::{
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::{
|
||||
finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
|
||||
},
|
||||
@@ -30,7 +31,10 @@ use crate::{
|
||||
|
||||
use super::{
|
||||
heatmap::HeatMapLayer,
|
||||
scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
|
||||
scheduler::{
|
||||
self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult,
|
||||
TenantBackgroundJobs,
|
||||
},
|
||||
SecondaryTenant,
|
||||
};
|
||||
|
||||
@@ -44,7 +48,6 @@ use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -74,12 +77,14 @@ pub(super) async fn downloader_task(
|
||||
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
root_ctx: RequestContext,
|
||||
) {
|
||||
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
|
||||
|
||||
let generator = SecondaryDownloader {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
root_ctx,
|
||||
};
|
||||
let mut scheduler = Scheduler::new(generator, concurrency);
|
||||
|
||||
@@ -92,6 +97,7 @@ pub(super) async fn downloader_task(
|
||||
struct SecondaryDownloader {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
root_ctx: RequestContext,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -270,7 +276,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
|
||||
// take priority to run again.
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
|
||||
detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
@@ -301,11 +307,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
}
|
||||
|
||||
if detail.next_download.is_none() {
|
||||
// Initialize with a jitter: this spreads initial downloads on startup
|
||||
// or mass-attach across our freshen interval.
|
||||
let jittered_period =
|
||||
rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
|
||||
detail.next_download = Some(now.checked_add(jittered_period).expect(
|
||||
// Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times. Subsequent
|
||||
// rounds will use a smaller jitter to avoid accidentally synchronizing later.
|
||||
detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
|
||||
"Using our constant, which is known to be small compared with clock range",
|
||||
));
|
||||
}
|
||||
@@ -367,11 +371,12 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
let remote_storage = self.remote_storage.clone();
|
||||
let conf = self.tenant_manager.get_conf();
|
||||
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
|
||||
let download_ctx = self.root_ctx.attached_child();
|
||||
(RunningDownload { barrier }, Box::pin(async move {
|
||||
let _completion = completion;
|
||||
|
||||
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
|
||||
.download()
|
||||
.download(&download_ctx)
|
||||
.await
|
||||
{
|
||||
Err(UpdateError::NoData) => {
|
||||
@@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(&self) -> Result<(), UpdateError> {
|
||||
async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
// For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
|
||||
@@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline)
|
||||
self.download_timeline(timeline, ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
@@ -742,7 +747,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_path = self
|
||||
@@ -875,6 +884,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
&layer.name,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
|
||||
@@ -20,12 +20,14 @@ use crate::{
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
|
||||
|
||||
use super::{
|
||||
heatmap::HeatMapTenant,
|
||||
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
|
||||
scheduler::{
|
||||
self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult,
|
||||
TenantBackgroundJobs,
|
||||
},
|
||||
CommandRequest, UploadCommand,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -181,15 +183,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
let state = self
|
||||
.tenants
|
||||
.entry(*tenant.get_tenant_shard_id())
|
||||
.or_insert_with(|| {
|
||||
let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
|
||||
|
||||
UploaderTenantState {
|
||||
tenant: Arc::downgrade(&tenant),
|
||||
last_upload: None,
|
||||
next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
|
||||
last_digest: None,
|
||||
}
|
||||
.or_insert_with(|| UploaderTenantState {
|
||||
tenant: Arc::downgrade(&tenant),
|
||||
last_upload: None,
|
||||
next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
|
||||
last_digest: None,
|
||||
});
|
||||
|
||||
// Decline to do the upload if insufficient time has passed
|
||||
@@ -274,7 +272,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
|
||||
let next_upload = tenant
|
||||
.get_heatmap_period()
|
||||
.and_then(|period| now.checked_add(period));
|
||||
.and_then(|period| now.checked_add(period_jitter(period, 5)));
|
||||
|
||||
WriteComplete {
|
||||
tenant_shard_id: *tenant.get_tenant_shard_id(),
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use futures::Future;
|
||||
use rand::Rng;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
marker::PhantomData,
|
||||
@@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse};
|
||||
const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
|
||||
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Jitter a Duration by an integer percentage. Returned values are uniform
|
||||
/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range)
|
||||
pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration {
|
||||
if d == Duration::ZERO {
|
||||
d
|
||||
} else {
|
||||
rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
|
||||
}
|
||||
}
|
||||
|
||||
/// When a periodic task first starts, it should wait for some time in the range 0..period, so
|
||||
/// that starting many such tasks at the same time spreads them across the time range.
|
||||
pub(super) fn period_warmup(period: Duration) -> Duration {
|
||||
if period == Duration::ZERO {
|
||||
period
|
||||
} else {
|
||||
rand::thread_rng().gen_range(Duration::ZERO..period)
|
||||
}
|
||||
}
|
||||
|
||||
/// Scheduling helper for background work across many tenants.
|
||||
///
|
||||
/// Systems that need to run background work across many tenants may use this type
|
||||
|
||||
@@ -118,9 +118,6 @@ pub(super) async fn gather_inputs(
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ModelInputs> {
|
||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||
//
|
||||
// FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the
|
||||
// whole computation. It does not make sense from the billing perspective.
|
||||
tenant
|
||||
.refresh_gc_info(cancel, ctx)
|
||||
.await
|
||||
@@ -192,7 +189,9 @@ pub(super) async fn gather_inputs(
|
||||
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// horizon_cutoff.
|
||||
let mut next_gc_cutoff = gc_info.pitr_cutoff;
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let horizon_cutoff = gc_info.cutoffs.horizon;
|
||||
let mut next_gc_cutoff = pitr_cutoff;
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
@@ -219,6 +218,8 @@ pub(super) async fn gather_inputs(
|
||||
.map(|lsn| (lsn, LsnKind::BranchPoint))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
drop(gc_info);
|
||||
|
||||
// Add branch points we collected earlier, just in case there were any that were
|
||||
// not present in retain_lsns. We will remove any duplicates below later.
|
||||
if let Some(this_branchpoints) = branchpoints.get(&timeline_id) {
|
||||
@@ -297,8 +298,8 @@ pub(super) async fn gather_inputs(
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
horizon_cutoff: gc_info.horizon_cutoff,
|
||||
pitr_cutoff: gc_info.pitr_cutoff,
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
retention_param_cutoff,
|
||||
});
|
||||
|
||||
@@ -428,9 +428,15 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let (_, res) = self
|
||||
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init())
|
||||
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx)
|
||||
.await;
|
||||
res
|
||||
}
|
||||
@@ -441,9 +447,10 @@ impl DeltaLayerWriterInner {
|
||||
lsn: Lsn,
|
||||
val: Vec<u8>,
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
let (val, res) = self.blob_writer.write_blob(val).await;
|
||||
let (val, res) = self.blob_writer.write_blob(val, ctx).await;
|
||||
let off = match res {
|
||||
Ok(off) => off,
|
||||
Err(e) => return (val, Err(anyhow::anyhow!(e))),
|
||||
@@ -463,18 +470,23 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
|
||||
async fn finish(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
let mut file = self.blob_writer.into_inner().await?;
|
||||
let mut file = self.blob_writer.into_inner(ctx).await?;
|
||||
|
||||
// Write out the index
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
for buf in block_buf.blocks {
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res?;
|
||||
}
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
@@ -494,7 +506,7 @@ impl DeltaLayerWriterInner {
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
@@ -592,8 +604,18 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value(key, lsn, val, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn put_value_bytes(
|
||||
@@ -602,11 +624,12 @@ impl DeltaLayerWriter {
|
||||
lsn: Lsn,
|
||||
val: Vec<u8>,
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value_bytes(key, lsn, val, will_init)
|
||||
.put_value_bytes(key, lsn, val, will_init, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -621,10 +644,11 @@ impl DeltaLayerWriter {
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let inner = self.inner.take().unwrap();
|
||||
let temp_path = inner.path.clone();
|
||||
let result = inner.finish(key_end, timeline).await;
|
||||
let result = inner.finish(key_end, timeline, ctx).await;
|
||||
// The delta layer files can sometimes be really large. Clean them up.
|
||||
if result.is_err() {
|
||||
tracing::warn!(
|
||||
@@ -692,7 +716,7 @@ impl DeltaLayer {
|
||||
// TODO: could use smallvec here, but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1281,7 +1305,13 @@ impl DeltaLayerInner {
|
||||
per_blob_copy.extend_from_slice(data);
|
||||
|
||||
let (tmp, res) = writer
|
||||
.put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
|
||||
.put_value_bytes(
|
||||
key,
|
||||
lsn,
|
||||
std::mem::take(&mut per_blob_copy),
|
||||
will_init,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
per_blob_copy = tmp;
|
||||
res?;
|
||||
@@ -1760,12 +1790,14 @@ mod test {
|
||||
|
||||
for entry in entries {
|
||||
let (_, res) = writer
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
|
||||
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
|
||||
let resident = writer
|
||||
.finish(entries_meta.key_range.end, &timeline, &ctx)
|
||||
.await?;
|
||||
|
||||
let inner = resident.as_delta(&ctx).await?;
|
||||
|
||||
@@ -1951,7 +1983,7 @@ mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
|
||||
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
|
||||
|
||||
copied_layer.as_delta(ctx).await.unwrap();
|
||||
|
||||
|
||||
@@ -357,7 +357,7 @@ impl ImageLayer {
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -677,9 +677,14 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
|
||||
async fn put_image(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let (_img, res) = self.blob_writer.write_blob(img).await;
|
||||
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
|
||||
// TODO: re-use the buffer for `img` further upstack
|
||||
let off = res?;
|
||||
|
||||
@@ -693,7 +698,11 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
|
||||
async fn finish(
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -704,7 +713,7 @@ impl ImageLayerWriterInner {
|
||||
.await?;
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
for buf in block_buf.blocks {
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res?;
|
||||
}
|
||||
|
||||
@@ -724,7 +733,7 @@ impl ImageLayerWriterInner {
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
@@ -806,8 +815,13 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img).await
|
||||
pub async fn put_image(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
///
|
||||
@@ -816,8 +830,9 @@ impl ImageLayerWriter {
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(timeline).await
|
||||
self.inner.take().unwrap().finish(timeline, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashSet};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -78,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
index: HashMap<Key, VecMap<Lsn, u64>>,
|
||||
index: BTreeMap<Key, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
@@ -384,29 +384,24 @@ impl InMemoryLayer {
|
||||
let mut planned_block_reads = BinaryHeap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
|
||||
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
|
||||
None => self.start_lsn..end_lsn,
|
||||
};
|
||||
for (key, vec_map) in inner.index.range(range.start..range.end) {
|
||||
let lsn_range = match reconstruct_state.get_cached_lsn(key) {
|
||||
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
|
||||
None => self.start_lsn..end_lsn,
|
||||
};
|
||||
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
planned_block_reads.push(BlockRead {
|
||||
key,
|
||||
lsn: *entry_lsn,
|
||||
block_offset: *pos,
|
||||
});
|
||||
}
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
planned_block_reads.push(BlockRead {
|
||||
key: *key,
|
||||
lsn: *entry_lsn,
|
||||
block_offset: *pos,
|
||||
});
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
|
||||
let keyspace_size = keyspace.total_size();
|
||||
let keyspace_size = keyspace.total_raw_size();
|
||||
|
||||
let mut completed_keys = HashSet::new();
|
||||
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
|
||||
@@ -499,7 +494,7 @@ impl InMemoryLayer {
|
||||
end_lsn: OnceLock::new(),
|
||||
opened_at: Instant::now(),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
index: HashMap::new(),
|
||||
index: BTreeMap::new(),
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
@@ -602,14 +597,17 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
|
||||
/// layer will only contain the key range the user specifies, and may return `None`
|
||||
/// if there are no matching keys.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub(crate) async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ResidentLayer> {
|
||||
key_range: Option<Range<Key>>,
|
||||
) -> Result<Option<ResidentLayer>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -623,6 +621,21 @@ impl InMemoryLayer {
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
let keys: Vec<_> = if let Some(key_range) = key_range {
|
||||
inner
|
||||
.index
|
||||
.iter()
|
||||
.filter(|(k, _)| key_range.contains(k))
|
||||
.map(|(k, m)| (k.to_i128(), m))
|
||||
.collect()
|
||||
} else {
|
||||
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -636,33 +649,24 @@ impl InMemoryLayer {
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
// Sort the keys because delta layer writer expects them sorted.
|
||||
//
|
||||
// NOTE: this sort can take up significant time if the layer has millions of
|
||||
// keys. To speed up all the comparisons we convert the key to i128 and
|
||||
// keep the value as a reference.
|
||||
let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
|
||||
keys.sort_unstable_by_key(|k| k.0);
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
for (key, vec_map) in keys.iter() {
|
||||
let key = Key::from_i128(*key);
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(key, *lsn, buf, will_init)
|
||||
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
}
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
|
||||
Ok(delta_layer)
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,9 +14,10 @@ use utils::lsn::Lsn;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::repository::Key;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
|
||||
|
||||
@@ -401,8 +402,8 @@ impl Layer {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
&self.0.path_str
|
||||
pub(crate) fn debug_str(&self) -> &Arc<str> {
|
||||
&self.0.debug_str
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
@@ -527,8 +528,8 @@ struct LayerInner {
|
||||
/// Full path to the file; unclear if this should exist anymore.
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// String representation of the full path, used for traversal id.
|
||||
path_str: Arc<str>,
|
||||
/// String representation of the layer, used for traversal id.
|
||||
debug_str: Arc<str>,
|
||||
|
||||
desc: PersistentLayerDesc,
|
||||
|
||||
@@ -735,7 +736,7 @@ impl LayerInner {
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
path_str: path.to_string().into(),
|
||||
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
|
||||
path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
@@ -939,11 +940,20 @@ impl LayerInner {
|
||||
return Err(DownloadError::DownloadRequired);
|
||||
}
|
||||
|
||||
let download_ctx = ctx
|
||||
.map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download))
|
||||
.unwrap_or(RequestContext::new(
|
||||
TaskKind::LayerDownload,
|
||||
DownloadBehavior::Download,
|
||||
));
|
||||
|
||||
async move {
|
||||
tracing::info!(%reason, "downloading on-demand");
|
||||
|
||||
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||
let res = self.download_init_and_wait(timeline, permit).await?;
|
||||
let res = self
|
||||
.download_init_and_wait(timeline, permit, download_ctx)
|
||||
.await?;
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
Ok(res)
|
||||
}
|
||||
@@ -982,6 +992,7 @@ impl LayerInner {
|
||||
self: &Arc<Self>,
|
||||
timeline: Arc<Timeline>,
|
||||
permit: heavier_once_cell::InitPermit,
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<DownloadedLayer>, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -1011,7 +1022,7 @@ impl LayerInner {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let res = this.download_and_init(timeline, permit).await;
|
||||
let res = this.download_and_init(timeline, permit, &ctx).await;
|
||||
|
||||
if let Err(res) = tx.send(res) {
|
||||
match res {
|
||||
@@ -1054,6 +1065,7 @@ impl LayerInner {
|
||||
self: &Arc<LayerInner>,
|
||||
timeline: Arc<Timeline>,
|
||||
permit: heavier_once_cell::InitPermit,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<DownloadedLayer>> {
|
||||
let client = timeline
|
||||
.remote_client
|
||||
@@ -1061,7 +1073,12 @@ impl LayerInner {
|
||||
.expect("checked before download_init_and_wait");
|
||||
|
||||
let result = client
|
||||
.download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel)
|
||||
.download_layer_file(
|
||||
&self.desc.filename(),
|
||||
&self.metadata(),
|
||||
&timeline.cancel,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
|
||||
@@ -16,8 +16,11 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
|
||||
keyspace::KeySpaceAccum,
|
||||
key::{
|
||||
AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
@@ -55,8 +58,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
@@ -66,6 +67,7 @@ use crate::{
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
};
|
||||
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
|
||||
use crate::{
|
||||
disk_usage_eviction_task::finite_f32,
|
||||
tenant::storage_layer::{
|
||||
@@ -77,6 +79,9 @@ use crate::{
|
||||
use crate::{
|
||||
disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
|
||||
};
|
||||
use crate::{
|
||||
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
|
||||
};
|
||||
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
|
||||
use crate::{
|
||||
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
|
||||
@@ -86,7 +91,7 @@ use crate::{
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::metrics::{
|
||||
GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
};
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
@@ -137,6 +142,25 @@ pub(super) enum FlushLoopState {
|
||||
Exited,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
pub enum ImageLayerCreationMode {
|
||||
/// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path.
|
||||
Try,
|
||||
/// Force creating the image layers if possible. For now, no image layers will be created
|
||||
/// for metadata keys. Used in compaction code path with force flag enabled.
|
||||
Force,
|
||||
/// Initial ingestion of the data, and no data should be dropped in this function. This
|
||||
/// means that no metadata keys should be included in the partitions. Used in flush frozen layer
|
||||
/// code path.
|
||||
Initial,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ImageLayerCreationMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) struct Hole {
|
||||
@@ -306,7 +330,7 @@ pub struct Timeline {
|
||||
|
||||
// List of child timelines and their branch points. This is needed to avoid
|
||||
// garbage collecting data that is still needed by the child timelines.
|
||||
pub gc_info: std::sync::RwLock<GcInfo>,
|
||||
pub(crate) gc_info: std::sync::RwLock<GcInfo>,
|
||||
|
||||
// It may change across major versions so for simplicity
|
||||
// keep it after running initdb for a timeline.
|
||||
@@ -317,7 +341,7 @@ pub struct Timeline {
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// When did we last calculate the partitioning?
|
||||
partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
|
||||
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -390,33 +414,59 @@ pub struct WalReceiverInfo {
|
||||
pub last_received_msg_ts: u128,
|
||||
}
|
||||
|
||||
///
|
||||
/// Information about how much history needs to be retained, needed by
|
||||
/// Garbage Collection.
|
||||
///
|
||||
pub struct GcInfo {
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcInfo {
|
||||
/// Specific LSNs that are needed.
|
||||
///
|
||||
/// Currently, this includes all points where child branches have
|
||||
/// been forked off from. In the future, could also include
|
||||
/// explicit user-defined snapshot points.
|
||||
pub retain_lsns: Vec<Lsn>,
|
||||
pub(crate) retain_lsns: Vec<Lsn>,
|
||||
|
||||
/// In addition to 'retain_lsns', keep everything newer than this
|
||||
/// point.
|
||||
/// The cutoff coordinates, which are combined by selecting the minimum.
|
||||
pub(crate) cutoffs: GcCutoffs,
|
||||
}
|
||||
|
||||
impl GcInfo {
|
||||
pub(crate) fn min_cutoff(&self) -> Lsn {
|
||||
self.cutoffs.select_min()
|
||||
}
|
||||
}
|
||||
|
||||
/// The `GcInfo` component describing which Lsns need to be retained.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct GcCutoffs {
|
||||
/// Keep everything newer than this point.
|
||||
///
|
||||
/// This is calculated by subtracting 'gc_horizon' setting from
|
||||
/// last-record LSN
|
||||
///
|
||||
/// FIXME: is this inclusive or exclusive?
|
||||
pub horizon_cutoff: Lsn,
|
||||
pub(crate) horizon: Lsn,
|
||||
|
||||
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
|
||||
/// point.
|
||||
///
|
||||
/// This is calculated by finding a number such that a record is needed for PITR
|
||||
/// if only if its LSN is larger than 'pitr_cutoff'.
|
||||
pub pitr_cutoff: Lsn,
|
||||
pub(crate) pitr: Lsn,
|
||||
}
|
||||
|
||||
impl Default for GcCutoffs {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
horizon: Lsn::INVALID,
|
||||
pitr: Lsn::INVALID,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GcCutoffs {
|
||||
fn select_min(&self) -> Lsn {
|
||||
std::cmp::min(self.horizon, self.pitr)
|
||||
}
|
||||
}
|
||||
|
||||
/// An error happened in a get() operation.
|
||||
@@ -445,7 +495,6 @@ pub(crate) enum PageReconstructError {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MissingKeyError {
|
||||
stuck_at_lsn: bool,
|
||||
key: Key,
|
||||
shard: ShardNumber,
|
||||
cont_lsn: Lsn,
|
||||
@@ -457,23 +506,13 @@ pub struct MissingKeyError {
|
||||
|
||||
impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.stuck_at_lsn {
|
||||
// Records are found in this timeline but no image layer or initial delta record was found.
|
||||
write!(
|
||||
f,
|
||||
"could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
} else {
|
||||
// No records in this timeline.
|
||||
write!(
|
||||
f,
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
write!(
|
||||
f,
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if !self.traversal_path.is_empty() {
|
||||
@@ -549,8 +588,8 @@ pub(crate) enum GetVectoredError {
|
||||
#[error("Requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
|
||||
#[error("Requested key {0} not found")]
|
||||
MissingKey(Key),
|
||||
#[error("Requested key not found: {0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
|
||||
#[error(transparent)]
|
||||
GetReadyAncestorError(GetReadyAncestorError),
|
||||
@@ -659,7 +698,7 @@ impl From<GetVectoredError> for PageReconstructError {
|
||||
GetVectoredError::Cancelled => PageReconstructError::Cancelled,
|
||||
GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")),
|
||||
err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()),
|
||||
err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()),
|
||||
GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err),
|
||||
GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err),
|
||||
GetVectoredError::Other(err) => PageReconstructError::Other(err),
|
||||
}
|
||||
@@ -851,16 +890,15 @@ impl Timeline {
|
||||
value
|
||||
}
|
||||
}
|
||||
None => {
|
||||
error!(
|
||||
"Expected {}, but singular vectored get returned nothing",
|
||||
key
|
||||
);
|
||||
Err(PageReconstructError::Other(anyhow!(
|
||||
"Singular vectored get did not return a value for {}",
|
||||
key
|
||||
)))
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(0),
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path: Vec::new(),
|
||||
backtrace: None,
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -936,7 +974,7 @@ impl Timeline {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
|
||||
let key_count = keyspace.total_size().try_into().unwrap();
|
||||
let key_count = keyspace.total_raw_size().try_into().unwrap();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
@@ -1010,6 +1048,70 @@ impl Timeline {
|
||||
res
|
||||
}
|
||||
|
||||
/// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored
|
||||
/// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found
|
||||
/// during the search, but for the scan interface, it returns all existing key-value pairs, and does
|
||||
/// not expect each single key in the key space will be found. The semantics is closer to the RocksDB
|
||||
/// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored
|
||||
/// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that
|
||||
/// the scan operation will not cause OOM in the future.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn scan(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
|
||||
trace!(
|
||||
"key-value scan request for {:?}@{} from task kind {:?}",
|
||||
keyspace,
|
||||
lsn,
|
||||
ctx.task_kind()
|
||||
);
|
||||
|
||||
// We should generalize this into Keyspace::contains in the future.
|
||||
for range in &keyspace.ranges {
|
||||
if range.start.field1 < METADATA_KEY_BEGIN_PREFIX
|
||||
|| range.end.field1 >= METADATA_KEY_END_PREFIX
|
||||
{
|
||||
return Err(GetVectoredError::Other(anyhow::anyhow!(
|
||||
"only metadata keyspace can be scanned"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let start = crate::metrics::SCAN_LATENCY
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(ScanLatencyOngoingRecording::start_recording);
|
||||
|
||||
// start counting after throttle so that throttle time
|
||||
// is always less than observation time
|
||||
let throttled = self
|
||||
.timeline_get_throttle
|
||||
// assume scan = 1 quota for now until we find a better way to process this
|
||||
.throttle(ctx, 1)
|
||||
.await;
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
ValuesReconstructState::default(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Some(recording) = start {
|
||||
recording.observe(throttled);
|
||||
}
|
||||
|
||||
vectored_res
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn get_vectored_sequential_impl(
|
||||
&self,
|
||||
@@ -1018,6 +1120,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let mut values = BTreeMap::new();
|
||||
|
||||
for range in keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
@@ -1030,16 +1133,17 @@ impl Timeline {
|
||||
Err(Cancelled | AncestorStopping(_)) => {
|
||||
return Err(GetVectoredError::Cancelled)
|
||||
}
|
||||
// we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380
|
||||
Err(MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
..
|
||||
})) if !NON_INHERITED_RANGE.contains(&key) => {
|
||||
// The vectored read path handles non inherited keys specially.
|
||||
// If such a a key cannot be reconstructed from the current timeline,
|
||||
// the vectored read path returns a key level error as opposed to a top
|
||||
// level error.
|
||||
return Err(GetVectoredError::MissingKey(key));
|
||||
Err(MissingKey(_))
|
||||
if NON_INHERITED_RANGE.contains(&key)
|
||||
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
|
||||
{
|
||||
// Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range.
|
||||
// When we add more types of keys into the page server, we should revisit this part of code and throw errors
|
||||
// accordingly.
|
||||
key = key.next();
|
||||
}
|
||||
Err(MissingKey(err)) => {
|
||||
return Err(GetVectoredError::MissingKey(err));
|
||||
}
|
||||
Err(Other(err))
|
||||
if err
|
||||
@@ -1076,7 +1180,7 @@ impl Timeline {
|
||||
mut reconstruct_state: ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let get_kind = if keyspace.total_size() == 1 {
|
||||
let get_kind = if keyspace.total_raw_size() == 1 {
|
||||
GetKind::Singular
|
||||
} else {
|
||||
GetKind::Vectored
|
||||
@@ -1126,6 +1230,11 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
if keyspace.overlaps(&Key::metadata_key_range()) {
|
||||
// skip validation for metadata key range
|
||||
return;
|
||||
}
|
||||
|
||||
let sequential_res = self
|
||||
.get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
|
||||
.await;
|
||||
@@ -1135,7 +1244,7 @@ impl Timeline {
|
||||
match (lhs, rhs) {
|
||||
(Oversized(l), Oversized(r)) => l == r,
|
||||
(InvalidLsn(l), InvalidLsn(r)) => l == r,
|
||||
(MissingKey(l), MissingKey(r)) => l == r,
|
||||
(MissingKey(l), MissingKey(r)) => l.key == r.key,
|
||||
(GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
|
||||
(Other(_), Other(_)) => true,
|
||||
_ => false,
|
||||
@@ -1149,6 +1258,11 @@ impl Timeline {
|
||||
panic!(concat!("Sequential get failed with {}, but vectored get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
seq_err, keyspace, lsn) },
|
||||
(Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
|
||||
// Sequential get runs after vectored get, so it is possible for the later
|
||||
// to time out while waiting for its ancestor's Lsn to become ready and for the
|
||||
// former to succeed (it essentially has a doubled wait time).
|
||||
},
|
||||
(Ok(_), Err(vec_err)) => {
|
||||
panic!(concat!("Vectored get failed with {}, but sequential get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
@@ -1229,6 +1343,12 @@ impl Timeline {
|
||||
self.last_record_lsn.load()
|
||||
}
|
||||
|
||||
/// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
|
||||
/// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
|
||||
pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
|
||||
self.last_record_lsn.status_receiver()
|
||||
}
|
||||
|
||||
pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn.load()
|
||||
}
|
||||
@@ -2081,11 +2201,7 @@ impl Timeline {
|
||||
|
||||
write_lock: tokio::sync::Mutex::new(None),
|
||||
|
||||
gc_info: std::sync::RwLock::new(GcInfo {
|
||||
retain_lsns: Vec::new(),
|
||||
horizon_cutoff: Lsn(0),
|
||||
pitr_cutoff: Lsn(0),
|
||||
}),
|
||||
gc_info: std::sync::RwLock::new(GcInfo::default()),
|
||||
|
||||
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
|
||||
initdb_lsn: metadata.initdb_lsn(),
|
||||
@@ -2099,7 +2215,10 @@ impl Timeline {
|
||||
// initial logical size is 0.
|
||||
LogicalSize::empty_initial()
|
||||
},
|
||||
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||
partitioning: tokio::sync::Mutex::new((
|
||||
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
|
||||
Lsn(0),
|
||||
)),
|
||||
repartition_threshold: 0,
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
|
||||
@@ -2915,7 +3034,7 @@ trait TraversalLayerExt {
|
||||
|
||||
impl TraversalLayerExt for Layer {
|
||||
fn traversal_id(&self) -> TraversalId {
|
||||
Arc::clone(self.local_path_str())
|
||||
Arc::clone(self.debug_str())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2991,7 +3110,6 @@ impl Timeline {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: true,
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(cont_lsn.0 - 1),
|
||||
@@ -3006,7 +3124,6 @@ impl Timeline {
|
||||
}
|
||||
ValueReconstructResult::Missing => {
|
||||
return Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn,
|
||||
@@ -3101,7 +3218,6 @@ impl Timeline {
|
||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
drop(guard);
|
||||
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
@@ -3171,38 +3287,13 @@ impl Timeline {
|
||||
// Do not descend into the ancestor timeline for aux files.
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
// TODO(chi): this will need to be updated for aux files v2 storage
|
||||
if keyspace.overlaps(&NON_INHERITED_RANGE) {
|
||||
let removed = keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE],
|
||||
});
|
||||
keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
|
||||
});
|
||||
|
||||
for range in removed.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path: Vec::default(),
|
||||
backtrace: if cfg!(test) {
|
||||
Some(std::backtrace::Backtrace::force_capture())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}),
|
||||
);
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
|
||||
// Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
|
||||
// into ancestor timelines). TODO: is there any other metadata which we want to inherit?
|
||||
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -3215,14 +3306,24 @@ impl Timeline {
|
||||
timeline = &*timeline_owned;
|
||||
}
|
||||
|
||||
if keyspace.total_size() != 0 {
|
||||
return Err(GetVectoredError::MissingKey(keyspace.start().unwrap()));
|
||||
if keyspace.total_raw_size() != 0 {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
shard: self
|
||||
.shard_identity
|
||||
.get_shard_number(&keyspace.start().unwrap()),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
traversal_path: vec![],
|
||||
backtrace: None,
|
||||
}));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect the reconstruct data for a ketspace from the specified timeline.
|
||||
/// Collect the reconstruct data for a keyspace from the specified timeline.
|
||||
///
|
||||
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
|
||||
/// the current keyspace. The current keyspace of the search at any given timeline
|
||||
@@ -3651,66 +3752,103 @@ impl Timeline {
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
// repository have the same LSN.
|
||||
let lsn_range = frozen_layer.get_lsn_range();
|
||||
let (layers_to_upload, delta_layer_to_add) =
|
||||
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
|
||||
#[cfg(test)]
|
||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
||||
panic!("flush loop not running")
|
||||
}
|
||||
FlushLoopState::Running {
|
||||
initdb_optimization_count,
|
||||
..
|
||||
} => {
|
||||
|
||||
// Whether to directly create image layers for this flush, or flush them as delta layers
|
||||
let create_image_layer =
|
||||
lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);
|
||||
|
||||
#[cfg(test)]
|
||||
{
|
||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
||||
panic!("flush loop not running")
|
||||
}
|
||||
FlushLoopState::Running {
|
||||
expect_initdb_optimization,
|
||||
initdb_optimization_count,
|
||||
..
|
||||
} => {
|
||||
if create_image_layer {
|
||||
*initdb_optimization_count += 1;
|
||||
}
|
||||
}
|
||||
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
|
||||
// require downloading anything during initial import.
|
||||
let (partitioning, _lsn) = self
|
||||
.repartition(
|
||||
self.initdb_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
EnumSet::empty(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// For image layers, we add them immediately into the layer map.
|
||||
(
|
||||
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
|
||||
.await?,
|
||||
None,
|
||||
)
|
||||
} else {
|
||||
#[cfg(test)]
|
||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
||||
panic!("flush loop not running")
|
||||
}
|
||||
FlushLoopState::Running {
|
||||
expect_initdb_optimization,
|
||||
..
|
||||
} => {
|
||||
} else {
|
||||
assert!(!*expect_initdb_optimization, "expected initdb optimization");
|
||||
}
|
||||
}
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
||||
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
|
||||
(
|
||||
// FIXME: even though we have a single image and single delta layer assumption
|
||||
// we push them to vec
|
||||
vec![layer.clone()],
|
||||
Some(layer),
|
||||
}
|
||||
}
|
||||
|
||||
let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
|
||||
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
|
||||
// require downloading anything during initial import.
|
||||
let ((rel_partition, metadata_partition), _lsn) = self
|
||||
.repartition(
|
||||
self.initdb_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
EnumSet::empty(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// For metadata, always create delta layers.
|
||||
let delta_layer = if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single aux file keyspace"
|
||||
);
|
||||
let metadata_keyspace = &metadata_partition.parts[0];
|
||||
assert_eq!(
|
||||
metadata_keyspace.0.ranges.len(),
|
||||
1,
|
||||
"aux file keyspace should be a single range"
|
||||
);
|
||||
self.create_delta_layer(
|
||||
&frozen_layer,
|
||||
ctx,
|
||||
Some(metadata_keyspace.0.ranges[0].clone()),
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// For image layers, we add them immediately into the layer map.
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
&rel_partition,
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
layers_to_upload.push(delta_layer.clone());
|
||||
(layers_to_upload, Some(delta_layer))
|
||||
} else {
|
||||
(layers_to_upload, None)
|
||||
}
|
||||
} else {
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
||||
let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
|
||||
panic!("delta layer cannot be empty if no filter is applied");
|
||||
};
|
||||
(
|
||||
// FIXME: even though we have a single image and single delta layer assumption
|
||||
// we push them to vec
|
||||
vec![layer.clone()],
|
||||
Some(layer),
|
||||
)
|
||||
};
|
||||
|
||||
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -3830,12 +3968,18 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: &Arc<InMemoryLayer>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
key_range: Option<Range<Key>>,
|
||||
) -> anyhow::Result<Option<ResidentLayer>> {
|
||||
let self_clone = Arc::clone(self);
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
let ctx = ctx.attached_child();
|
||||
let work = async move {
|
||||
let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
|
||||
let Some(new_delta) = frozen_layer
|
||||
.write_to_disk(&self_clone, &ctx, key_range)
|
||||
.await?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
|
||||
// We just need to fsync the directory in which these inodes are linked,
|
||||
// which we know to be the timeline directory.
|
||||
@@ -3854,7 +3998,7 @@ impl Timeline {
|
||||
.sync_all()
|
||||
.await
|
||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
||||
anyhow::Ok(new_delta)
|
||||
anyhow::Ok(Some(new_delta))
|
||||
};
|
||||
// Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
|
||||
// Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
|
||||
@@ -3881,19 +4025,20 @@ impl Timeline {
|
||||
partition_size: u64,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
|
||||
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
anyhow::bail!("repartition() called concurrently, this should not happen");
|
||||
};
|
||||
if lsn < partitioning_guard.1 {
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
|
||||
if lsn < *partition_lsn {
|
||||
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
|
||||
}
|
||||
|
||||
let distance = lsn.0 - partitioning_guard.1 .0;
|
||||
if partitioning_guard.1 != Lsn(0)
|
||||
let distance = lsn.0 - partition_lsn.0;
|
||||
if *partition_lsn != Lsn(0)
|
||||
&& distance <= self.repartition_threshold
|
||||
&& !flags.contains(CompactFlags::ForceRepartition)
|
||||
{
|
||||
@@ -3902,13 +4047,18 @@ impl Timeline {
|
||||
threshold = self.repartition_threshold,
|
||||
"no repartitioning needed"
|
||||
);
|
||||
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
||||
return Ok((
|
||||
(dense_partition.clone(), sparse_partition.clone()),
|
||||
*partition_lsn,
|
||||
));
|
||||
}
|
||||
|
||||
let keyspace = self.collect_keyspace(lsn, ctx).await?;
|
||||
let partitioning = keyspace.partition(partition_size);
|
||||
|
||||
*partitioning_guard = (partitioning, lsn);
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
}; // no partitioning for metadata keys for now
|
||||
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
|
||||
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
|
||||
}
|
||||
@@ -3964,12 +4114,12 @@ impl Timeline {
|
||||
false
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %force))]
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
|
||||
async fn create_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
mode: ImageLayerCreationMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
@@ -4006,19 +4156,26 @@ impl Timeline {
|
||||
for partition in partitioning.parts.iter() {
|
||||
let img_range = start..partition.ranges.last().unwrap().end;
|
||||
|
||||
let do_it = if force {
|
||||
true
|
||||
} else if check_for_image_layers {
|
||||
// [`Self::time_for_new_image_layer`] is CPU expensive,
|
||||
// so skip if we've not collected enough WAL since the last time
|
||||
self.time_for_new_image_layer(partition, lsn).await
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if !do_it {
|
||||
start = img_range.end;
|
||||
continue;
|
||||
if partition.overlaps(&Key::metadata_key_range()) {
|
||||
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
|
||||
// rather big change. Keep this patch small for now.
|
||||
match mode {
|
||||
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
|
||||
// skip image layer creation anyways for metadata keys.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
ImageLayerCreationMode::Initial => {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
|
||||
if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
@@ -4059,7 +4216,7 @@ impl Timeline {
|
||||
key = key.next();
|
||||
|
||||
// Maybe flush `key_rest_accum`
|
||||
if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| last_key_in_range
|
||||
{
|
||||
let results = self
|
||||
@@ -4098,7 +4255,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
// Write all the keys we just read into our new image layer.
|
||||
image_layer_writer.put_image(img_key, img).await?;
|
||||
image_layer_writer.put_image(img_key, img, ctx).await?;
|
||||
wrote_keys = true;
|
||||
}
|
||||
}
|
||||
@@ -4109,7 +4266,7 @@ impl Timeline {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
start = img_range.end;
|
||||
let image_layer = image_layer_writer.finish(self).await?;
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
image_layers.push(image_layer);
|
||||
} else {
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
@@ -4302,7 +4459,7 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update information about which layer files need to be retained on
|
||||
/// Find the Lsns above which layer files need to be retained on
|
||||
/// garbage collection. This is separate from actually performing the GC,
|
||||
/// and is updated more frequently, so that compaction can remove obsolete
|
||||
/// page versions more aggressively.
|
||||
@@ -4310,17 +4467,6 @@ impl Timeline {
|
||||
/// TODO: that's wishful thinking, compaction doesn't actually do that
|
||||
/// currently.
|
||||
///
|
||||
/// The caller specifies how much history is needed with the 3 arguments:
|
||||
///
|
||||
/// retain_lsns: keep a version of each page at these LSNs
|
||||
/// cutoff_horizon: also keep everything newer than this LSN
|
||||
/// pitr: the time duration required to keep data for PITR
|
||||
///
|
||||
/// The 'retain_lsns' list is currently used to prevent removing files that
|
||||
/// are needed by child timelines. In the future, the user might be able to
|
||||
/// name additional points in time to retain. The caller is responsible for
|
||||
/// collecting that information.
|
||||
///
|
||||
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
|
||||
/// needed by read-only nodes. (As of this writing, the caller just passes
|
||||
/// the latest LSN subtracted by a constant, and doesn't do anything smart
|
||||
@@ -4328,29 +4474,22 @@ impl Timeline {
|
||||
///
|
||||
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
|
||||
/// whether a record is needed for PITR.
|
||||
///
|
||||
/// NOTE: This function holds a short-lived lock to protect the 'gc_info'
|
||||
/// field, so that the three values passed as argument are stored
|
||||
/// atomically. But the caller is responsible for ensuring that no new
|
||||
/// branches are created that would need to be included in 'retain_lsns',
|
||||
/// for example. The caller should hold `Tenant::gc_cs` lock to ensure
|
||||
/// that.
|
||||
///
|
||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||
pub(super) async fn update_gc_info(
|
||||
pub(super) async fn find_gc_cutoffs(
|
||||
&self,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
cutoff_horizon: Lsn,
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<GcCutoffs> {
|
||||
let _timer = self
|
||||
.metrics
|
||||
.update_gc_info_histo
|
||||
.find_gc_cutoffs_histo
|
||||
.start_timer()
|
||||
.record_on_drop();
|
||||
|
||||
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
|
||||
|
||||
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
|
||||
//
|
||||
// Some unit tests depend on garbage-collection working even when
|
||||
@@ -4400,14 +4539,10 @@ impl Timeline {
|
||||
self.get_last_record_lsn()
|
||||
};
|
||||
|
||||
// Grab the lock and update the values
|
||||
*self.gc_info.write().unwrap() = GcInfo {
|
||||
retain_lsns,
|
||||
horizon_cutoff: cutoff_horizon,
|
||||
pitr_cutoff,
|
||||
};
|
||||
|
||||
Ok(())
|
||||
Ok(GcCutoffs {
|
||||
horizon: cutoff_horizon,
|
||||
pitr: pitr_cutoff,
|
||||
})
|
||||
}
|
||||
|
||||
/// Garbage collect layer files on a timeline that are no longer needed.
|
||||
@@ -4436,8 +4571,8 @@ impl Timeline {
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
|
||||
let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn());
|
||||
let pitr_cutoff = gc_info.pitr_cutoff;
|
||||
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let retain_lsns = gc_info.retain_lsns.clone();
|
||||
(horizon_cutoff, pitr_cutoff, retain_lsns)
|
||||
};
|
||||
|
||||
@@ -9,13 +9,13 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
|
||||
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
@@ -102,7 +102,7 @@ impl Timeline {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((partitioning, lsn)) => {
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
@@ -115,17 +115,37 @@ impl Timeline {
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let layers = self
|
||||
let dense_layers = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
&dense_partitioning,
|
||||
lsn,
|
||||
flags.contains(CompactFlags::ForceImageLayerCreation),
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
|
||||
self.upload_new_image_layers(layers)?;
|
||||
// For now, nothing will be produced...
|
||||
let sparse_layers = self
|
||||
.create_image_layers(
|
||||
&sparse_partitioning.clone().into_dense(),
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
assert!(sparse_layers.is_empty());
|
||||
|
||||
self.upload_new_image_layers(dense_layers)?;
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -500,7 +520,7 @@ impl Timeline {
|
||||
writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next(), self)
|
||||
.finish(prev_key.unwrap().next(), self, ctx)
|
||||
.await?,
|
||||
);
|
||||
writer = None;
|
||||
@@ -542,7 +562,11 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
|
||||
writer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value(key, lsn, value, ctx)
|
||||
.await?;
|
||||
} else {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
@@ -558,7 +582,7 @@ impl Timeline {
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
|
||||
new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?);
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
@@ -758,8 +782,9 @@ impl Timeline {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
|
||||
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
|
||||
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
|
||||
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
|
||||
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
|
||||
|
||||
pageserver_compaction::compact_tiered::compact_tiered(
|
||||
&mut adaptor,
|
||||
@@ -831,6 +856,10 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
|
||||
type RequestContext = crate::context::RequestContext;
|
||||
|
||||
fn get_shard_identity(&self) -> &ShardIdentity {
|
||||
self.timeline.get_shard_identity()
|
||||
}
|
||||
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Key>,
|
||||
@@ -947,7 +976,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
|
||||
let value = val.load(ctx).await?;
|
||||
|
||||
writer.put_value(key, lsn, value).await?;
|
||||
writer.put_value(key, lsn, value, ctx).await?;
|
||||
|
||||
prev = Some((key, lsn));
|
||||
}
|
||||
@@ -963,7 +992,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
});
|
||||
|
||||
let new_delta_layer = writer
|
||||
.finish(prev.unwrap().0.next(), &self.timeline)
|
||||
.finish(prev.unwrap().0.next(), &self.timeline, ctx)
|
||||
.await?;
|
||||
|
||||
self.new_deltas.push(new_delta_layer);
|
||||
@@ -1033,11 +1062,11 @@ impl TimelineAdaptor {
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, img).await?;
|
||||
image_layer_writer.put_image(key, img, ctx).await?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(&self.timeline).await?;
|
||||
let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
|
||||
|
||||
self.new_images.push(image_layer);
|
||||
|
||||
|
||||
@@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use pageserver_api::models::TimelineState;
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::proto::{
|
||||
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
|
||||
SubscribeByFilterRequest, TypeSubscription, TypedMessage,
|
||||
};
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step(
|
||||
.timeline
|
||||
.subscribe_for_state_updates();
|
||||
|
||||
let mut wait_lsn_status = connection_manager_state
|
||||
.timeline
|
||||
.subscribe_for_wait_lsn_updates();
|
||||
|
||||
// TODO: create a separate config option for discovery request interval
|
||||
let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
|
||||
let mut last_discovery_ts: Option<std::time::Instant> = None;
|
||||
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
@@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step(
|
||||
|
||||
loop {
|
||||
let time_until_next_retry = connection_manager_state.time_until_next_retry();
|
||||
let any_activity = connection_manager_state.wal_connection.is_some()
|
||||
|| !connection_manager_state.wal_stream_candidates.is_empty();
|
||||
|
||||
// These things are happening concurrently:
|
||||
//
|
||||
// - cancellation request
|
||||
// - cancellation request
|
||||
// - keep receiving WAL on the current connection
|
||||
// - if the shared state says we need to change connection, disconnect and return
|
||||
// - this runs in a separate task and we receive updates via a watch channel
|
||||
@@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// - receive updates from broker
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
// - if there's no connection and no candidates, try to send a discovery request
|
||||
|
||||
// NB: make sure each of the select expressions are cancellation-safe
|
||||
// (no need for arms to be cancellation-safe).
|
||||
@@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
}
|
||||
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
|
||||
|
||||
Some(()) = async {
|
||||
// Reminder: this match arm needs to be cancellation-safe.
|
||||
// Calculating time needed to wait until sending the next discovery request.
|
||||
// Current implementation is conservative and sends discovery requests only when there are no candidates.
|
||||
|
||||
if any_activity {
|
||||
// No need to send discovery requests if there is an active connection or candidates.
|
||||
return None;
|
||||
}
|
||||
|
||||
// Waiting for an active wait_lsn request.
|
||||
while wait_lsn_status.borrow().is_none() {
|
||||
if wait_lsn_status.changed().await.is_err() {
|
||||
// wait_lsn_status channel was closed, exiting
|
||||
warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// All preconditions met, preparing to send a discovery request.
|
||||
let now = std::time::Instant::now();
|
||||
let next_discovery_ts = last_discovery_ts
|
||||
.map(|ts| ts + discovery_request_interval)
|
||||
.unwrap_or_else(|| now);
|
||||
|
||||
if next_discovery_ts > now {
|
||||
// Prevent sending discovery requests too frequently.
|
||||
tokio::time::sleep(next_discovery_ts - now).await;
|
||||
}
|
||||
|
||||
let tenant_timeline_id = Some(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
});
|
||||
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
|
||||
let msg = TypedMessage {
|
||||
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
|
||||
safekeeper_timeline_info: None,
|
||||
safekeeper_discovery_request: Some(request),
|
||||
safekeeper_discovery_response: None,
|
||||
};
|
||||
|
||||
last_discovery_ts = Some(std::time::Instant::now());
|
||||
debug!("No active connection and no candidates, sending discovery request to the broker");
|
||||
|
||||
// Cancellation safety: we want to send a message to the broker, but publish_one()
|
||||
// function can get cancelled by the other select! arm. This is absolutely fine, because
|
||||
// we just want to receive broker updates and discovery is not important if we already
|
||||
// receive updates.
|
||||
//
|
||||
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
|
||||
// This is totally fine because of the reason above.
|
||||
|
||||
// This is a fire-and-forget request, we don't care about the response
|
||||
let _ = broker_client.publish_one(msg).await;
|
||||
debug!("Discovery request sent to the broker");
|
||||
None
|
||||
} => {}
|
||||
}
|
||||
|
||||
if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
|
||||
@@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
|
||||
) -> Result<Streaming<TypedMessage>, Cancelled> {
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
exponential_backoff(
|
||||
@@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates(
|
||||
attempt += 1;
|
||||
|
||||
// subscribe to the specific timeline
|
||||
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
});
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
subscription_key: Some(key),
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperTimelineInfo as i32,
|
||||
},
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
|
||||
},
|
||||
],
|
||||
tenant_timeline_id: Some(FilterTenantTimelineId {
|
||||
enabled: true,
|
||||
tenant_timeline_id: Some(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
}),
|
||||
}),
|
||||
};
|
||||
|
||||
match {
|
||||
tokio::select! {
|
||||
r = broker_client.subscribe_safekeeper_info(request) => { r }
|
||||
r = broker_client.subscribe_by_filter(request) => { r }
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
}
|
||||
} {
|
||||
@@ -398,7 +480,7 @@ struct RetryInfo {
|
||||
/// Data about the timeline to connect to, received from the broker.
|
||||
#[derive(Debug, Clone)]
|
||||
struct BrokerSkTimeline {
|
||||
timeline: SafekeeperTimelineInfo,
|
||||
timeline: SafekeeperDiscoveryResponse,
|
||||
/// Time at which the data was fetched from the broker last time, to track the stale data.
|
||||
latest_update: NaiveDateTime,
|
||||
}
|
||||
@@ -606,7 +688,41 @@ impl ConnectionManagerState {
|
||||
}
|
||||
|
||||
/// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
|
||||
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
|
||||
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
|
||||
let mut is_discovery = false;
|
||||
let timeline_update = match typed_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => {
|
||||
let info = match typed_msg.safekeeper_timeline_info {
|
||||
Some(info) => info,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_timeline_info");
|
||||
return;
|
||||
}
|
||||
};
|
||||
SafekeeperDiscoveryResponse {
|
||||
safekeeper_id: info.safekeeper_id,
|
||||
tenant_timeline_id: info.tenant_timeline_id,
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
is_discovery = true;
|
||||
match typed_msg.safekeeper_discovery_response {
|
||||
Some(response) => response,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_discovery_response");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// unexpected message
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
WALRECEIVER_BROKER_UPDATES.inc();
|
||||
|
||||
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
|
||||
@@ -619,7 +735,11 @@ impl ConnectionManagerState {
|
||||
);
|
||||
|
||||
if old_entry.is_none() {
|
||||
info!("New SK node was added: {new_safekeeper_id}");
|
||||
info!(
|
||||
?is_discovery,
|
||||
%new_safekeeper_id,
|
||||
"New SK node was added",
|
||||
);
|
||||
WALRECEIVER_CANDIDATES_ADDED.inc();
|
||||
}
|
||||
}
|
||||
@@ -818,7 +938,7 @@ impl ConnectionManagerState {
|
||||
fn select_connection_candidate(
|
||||
&self,
|
||||
node_to_omit: Option<NodeId>,
|
||||
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
|
||||
) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
|
||||
self.applicable_connection_candidates()
|
||||
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
|
||||
.max_by_key(|(_, info, _)| info.commit_lsn)
|
||||
@@ -828,7 +948,7 @@ impl ConnectionManagerState {
|
||||
/// Some safekeepers are filtered by the retry cooldown.
|
||||
fn applicable_connection_candidates(
|
||||
&self,
|
||||
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
|
||||
) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
self.wal_stream_candidates
|
||||
@@ -968,19 +1088,11 @@ mod tests {
|
||||
latest_update: NaiveDateTime,
|
||||
) -> BrokerSkTimeline {
|
||||
BrokerSkTimeline {
|
||||
timeline: SafekeeperTimelineInfo {
|
||||
timeline: SafekeeperDiscoveryResponse {
|
||||
safekeeper_id: 0,
|
||||
tenant_timeline_id: None,
|
||||
term: 0,
|
||||
last_log_term: 0,
|
||||
flush_lsn: 0,
|
||||
commit_lsn,
|
||||
backup_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
peer_horizon_lsn: 0,
|
||||
local_start_lsn: 0,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
http_connstr: safekeeper_connstr.to_owned(),
|
||||
availability_zone: None,
|
||||
},
|
||||
latest_update,
|
||||
@@ -1423,7 +1535,7 @@ mod tests {
|
||||
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
state.conf.availability_zone = test_az.clone();
|
||||
state.conf.availability_zone.clone_from(&test_az);
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1456,7 +1568,7 @@ mod tests {
|
||||
// We have another safekeeper with the same commit_lsn, and it have the same availability zone as
|
||||
// the current pageserver.
|
||||
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
|
||||
same_az_sk.timeline.availability_zone = test_az.clone();
|
||||
same_az_sk.timeline.availability_zone.clone_from(&test_az);
|
||||
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
|
||||
|
||||
use crate::page_cache::PageWriteGuard;
|
||||
@@ -615,6 +616,7 @@ impl VirtualFile {
|
||||
&self,
|
||||
buf: B,
|
||||
mut offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
@@ -623,7 +625,7 @@ impl VirtualFile {
|
||||
let mut buf = buf.slice(0..buf_len);
|
||||
while !buf.is_empty() {
|
||||
let res;
|
||||
(buf, res) = self.write_at(buf, offset).await;
|
||||
(buf, res) = self.write_at(buf, offset, ctx).await;
|
||||
match res {
|
||||
Ok(0) => {
|
||||
return (
|
||||
@@ -652,6 +654,7 @@ impl VirtualFile {
|
||||
pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<usize, Error>) {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
@@ -660,7 +663,7 @@ impl VirtualFile {
|
||||
let mut buf = buf.slice(0..nbytes);
|
||||
while !buf.is_empty() {
|
||||
let res;
|
||||
(buf, res) = self.write(buf).await;
|
||||
(buf, res) = self.write(buf, ctx).await;
|
||||
match res {
|
||||
Ok(0) => {
|
||||
return (
|
||||
@@ -684,9 +687,10 @@ impl VirtualFile {
|
||||
async fn write<B: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: Slice<B>,
|
||||
ctx: &RequestContext,
|
||||
) -> (Slice<B>, Result<usize, std::io::Error>) {
|
||||
let pos = self.pos;
|
||||
let (buf, res) = self.write_at(buf, pos).await;
|
||||
let (buf, res) = self.write_at(buf, pos, ctx).await;
|
||||
let n = match res {
|
||||
Ok(n) => n,
|
||||
Err(e) => return (buf, Err(e)),
|
||||
@@ -724,6 +728,7 @@ impl VirtualFile {
|
||||
&self,
|
||||
buf: Slice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (Slice<B>, Result<usize, Error>) {
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
@@ -1088,8 +1093,9 @@ impl OwnedAsyncWriter for VirtualFile {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let (buf, res) = VirtualFile::write_all(self, buf).await;
|
||||
let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
|
||||
res.map(move |v| (v, buf))
|
||||
}
|
||||
}
|
||||
@@ -1146,6 +1152,9 @@ fn get_open_files() -> &'static OpenFiles {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
@@ -1177,10 +1186,11 @@ mod tests {
|
||||
&self,
|
||||
buf: B,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let (_buf, res) = file.write_all_at(buf, offset).await;
|
||||
let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
|
||||
res
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
@@ -1201,10 +1211,11 @@ mod tests {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res.map(|_| ())
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
@@ -1275,6 +1286,7 @@ mod tests {
|
||||
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
|
||||
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
|
||||
{
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
@@ -1288,7 +1300,7 @@ mod tests {
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_a.write_all(b"foobar".to_vec()).await?;
|
||||
file_a.write_all(b"foobar".to_vec(), &ctx).await?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
let _ = file_a.read_string().await.unwrap_err();
|
||||
@@ -1297,7 +1309,7 @@ mod tests {
|
||||
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();
|
||||
let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", file_a.read_string().await?);
|
||||
@@ -1339,8 +1351,8 @@ mod tests {
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_b.write_all_at(b"BAR".to_vec(), 3).await?;
|
||||
file_b.write_all_at(b"FOO".to_vec(), 0).await?;
|
||||
file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
|
||||
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
|
||||
|
||||
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
|
||||
use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf};
|
||||
|
||||
pub struct Writer<W> {
|
||||
@@ -38,8 +38,9 @@ where
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let (nwritten, buf) = self.dst.write_all(buf).await?;
|
||||
let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
|
||||
self.bytes_amount += u64::try_from(nwritten).unwrap();
|
||||
Ok((nwritten, buf))
|
||||
}
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
use bytes::BytesMut;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
|
||||
/// A trait for doing owned-buffer write IO.
|
||||
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
|
||||
pub trait OwnedAsyncWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)>;
|
||||
}
|
||||
|
||||
@@ -57,8 +60,9 @@ where
|
||||
}
|
||||
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
|
||||
self.flush().await?;
|
||||
pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
|
||||
self.flush(ctx).await?;
|
||||
|
||||
let Self { buf, writer } = self;
|
||||
assert!(buf.is_some());
|
||||
Ok(writer)
|
||||
@@ -72,14 +76,15 @@ where
|
||||
}
|
||||
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
|
||||
where
|
||||
S: IoBuf + Send,
|
||||
{
|
||||
pub async fn write_buffered<S: IoBuf + Send>(
|
||||
&mut self,
|
||||
chunk: Slice<S>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, S)> {
|
||||
let chunk_len = chunk.len();
|
||||
// avoid memcpy for the middle of the chunk
|
||||
if chunk.len() >= self.buf().cap() {
|
||||
self.flush().await?;
|
||||
self.flush(ctx).await?;
|
||||
// do a big write, bypassing `buf`
|
||||
assert_eq!(
|
||||
self.buf
|
||||
@@ -88,7 +93,7 @@ where
|
||||
.pending(),
|
||||
0
|
||||
);
|
||||
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
|
||||
let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?;
|
||||
assert_eq!(nwritten, chunk_len);
|
||||
return Ok((nwritten, chunk));
|
||||
}
|
||||
@@ -104,7 +109,7 @@ where
|
||||
slice = &slice[n..];
|
||||
if buf.pending() >= buf.cap() {
|
||||
assert_eq!(buf.pending(), buf.cap());
|
||||
self.flush().await?;
|
||||
self.flush(ctx).await?;
|
||||
}
|
||||
}
|
||||
assert!(slice.is_empty(), "by now we should have drained the chunk");
|
||||
@@ -116,7 +121,11 @@ where
|
||||
/// It is less performant because we always have to copy the borrowed data into the internal buffer
|
||||
/// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
|
||||
/// for large writes.
|
||||
pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
|
||||
pub async fn write_buffered_borrowed(
|
||||
&mut self,
|
||||
mut chunk: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<usize> {
|
||||
let chunk_len = chunk.len();
|
||||
while !chunk.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
@@ -127,20 +136,20 @@ where
|
||||
chunk = &chunk[n..];
|
||||
if buf.pending() >= buf.cap() {
|
||||
assert_eq!(buf.pending(), buf.cap());
|
||||
self.flush().await?;
|
||||
self.flush(ctx).await?;
|
||||
}
|
||||
}
|
||||
Ok(chunk_len)
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> std::io::Result<()> {
|
||||
async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
|
||||
let buf = self.buf.take().expect("must not use after an error");
|
||||
let buf_len = buf.pending();
|
||||
if buf_len == 0 {
|
||||
self.buf = Some(buf);
|
||||
return Ok(());
|
||||
}
|
||||
let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
|
||||
let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?;
|
||||
assert_eq!(nwritten, buf_len);
|
||||
self.buf = Some(Buffer::reuse_after_flush(io_buf));
|
||||
Ok(())
|
||||
@@ -206,6 +215,7 @@ impl OwnedAsyncWriter for Vec<u8> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
_: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
@@ -222,6 +232,8 @@ mod tests {
|
||||
use bytes::BytesMut;
|
||||
|
||||
use super::*;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RecorderWriter {
|
||||
@@ -231,6 +243,7 @@ mod tests {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
_: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
@@ -243,10 +256,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn test_ctx() -> RequestContext {
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
|
||||
}
|
||||
|
||||
macro_rules! write {
|
||||
($writer:ident, $data:literal) => {{
|
||||
$writer
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_full())
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx())
|
||||
.await?;
|
||||
}};
|
||||
}
|
||||
@@ -260,7 +277,7 @@ mod tests {
|
||||
write!(writer, b"c");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
|
||||
@@ -276,7 +293,7 @@ mod tests {
|
||||
write!(writer, b"de");
|
||||
write!(writer, b"");
|
||||
write!(writer, b"fghijk");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
|
||||
@@ -292,7 +309,7 @@ mod tests {
|
||||
write!(writer, b"bc");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
|
||||
@@ -302,18 +319,20 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
|
||||
let ctx = test_ctx();
|
||||
let ctx = &ctx;
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
|
||||
writer.write_buffered_borrowed(b"abc").await?;
|
||||
writer.write_buffered_borrowed(b"d").await?;
|
||||
writer.write_buffered_borrowed(b"e").await?;
|
||||
writer.write_buffered_borrowed(b"fg").await?;
|
||||
writer.write_buffered_borrowed(b"hi").await?;
|
||||
writer.write_buffered_borrowed(b"j").await?;
|
||||
writer.write_buffered_borrowed(b"klmno").await?;
|
||||
writer.write_buffered_borrowed(b"abc", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"d", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"e", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"fg", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"hi", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"j", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"klmno", ctx).await?;
|
||||
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
let recorder = writer.flush_and_into_inner(ctx).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
{
|
||||
|
||||
3000
poetry.lock
generated
3000
poetry.lock
generated
File diff suppressed because one or more lines are too long
@@ -59,8 +59,8 @@ prometheus.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
reqwest-middleware.workspace = true
|
||||
reqwest.workspace = true
|
||||
reqwest-middleware = { workspace = true, features = ["json"] }
|
||||
reqwest-retry.workspace = true
|
||||
reqwest-tracing.workspace = true
|
||||
routerify.workspace = true
|
||||
@@ -84,6 +84,7 @@ tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio = { workspace = true, features = ["signal"] }
|
||||
tower-service.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
|
||||
@@ -118,8 +118,11 @@ struct ProxyCliArgs {
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
wake_compute_cache: String,
|
||||
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
|
||||
#[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
|
||||
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
|
||||
wake_compute_lock: String,
|
||||
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
|
||||
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
|
||||
connect_compute_lock: String,
|
||||
/// Allow self-signed certificates for compute nodes (for testing)
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
allow_self_signed_compute: bool,
|
||||
@@ -529,24 +532,21 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
endpoint_cache_config,
|
||||
)));
|
||||
|
||||
let config::WakeComputeLockOptions {
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
permits,
|
||||
epoch,
|
||||
timeout,
|
||||
} = args.wake_compute_lock.parse()?;
|
||||
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
|
||||
let locks = Box::leak(Box::new(
|
||||
console::locks::ApiLocks::new(
|
||||
"wake_compute_lock",
|
||||
permits,
|
||||
shards,
|
||||
timeout,
|
||||
epoch,
|
||||
&Metrics::get().wake_compute_lock,
|
||||
)
|
||||
.unwrap(),
|
||||
));
|
||||
let locks = Box::leak(Box::new(console::locks::ApiLocks::new(
|
||||
"wake_compute_lock",
|
||||
permits,
|
||||
shards,
|
||||
timeout,
|
||||
epoch,
|
||||
&Metrics::get().wake_compute_lock,
|
||||
)?));
|
||||
tokio::spawn(locks.garbage_collect_worker());
|
||||
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
@@ -572,6 +572,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
auth::BackendType::Link(MaybeOwned::Owned(url), ())
|
||||
}
|
||||
};
|
||||
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
permits,
|
||||
epoch,
|
||||
timeout,
|
||||
} = args.connect_compute_lock.parse()?;
|
||||
info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)");
|
||||
let connect_compute_locks = console::locks::ApiLocks::new(
|
||||
"connect_compute_lock",
|
||||
permits,
|
||||
shards,
|
||||
timeout,
|
||||
epoch,
|
||||
&Metrics::get().proxy.connect_compute_lock,
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
request_timeout: args.sql_over_http.sql_over_http_timeout,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
@@ -607,11 +624,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
region: args.region.clone(),
|
||||
aws_region: args.aws_region.clone(),
|
||||
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
|
||||
connect_compute_locks,
|
||||
connect_to_compute_retry_config: config::RetryConfig::parse(
|
||||
&args.connect_to_compute_retry,
|
||||
)?,
|
||||
}));
|
||||
|
||||
tokio::spawn(config.connect_compute_locks.garbage_collect_worker());
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
|
||||
14
proxy/src/cache/endpoints.rs
vendored
14
proxy/src/cache/endpoints.rs
vendored
@@ -21,7 +21,7 @@ use crate::{
|
||||
config::EndpointCacheConfig,
|
||||
context::RequestMonitoring,
|
||||
intern::{BranchIdInt, EndpointIdInt, ProjectIdInt},
|
||||
metrics::{Metrics, RedisErrors},
|
||||
metrics::{Metrics, RedisErrors, RedisEventsCount},
|
||||
rate_limiter::GlobalRateLimiter,
|
||||
redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider,
|
||||
EndpointId,
|
||||
@@ -100,14 +100,26 @@ impl EndpointsCache {
|
||||
if let Some(endpoint_created) = key.endpoint_created {
|
||||
self.endpoints
|
||||
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::EndpointCreated);
|
||||
}
|
||||
if let Some(branch_created) = key.branch_created {
|
||||
self.branches
|
||||
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::BranchCreated);
|
||||
}
|
||||
if let Some(project_created) = key.project_created {
|
||||
self.projects
|
||||
.insert(ProjectIdInt::from(&project_created.project_id.into()));
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::ProjectCreated);
|
||||
}
|
||||
}
|
||||
pub async fn do_read(
|
||||
|
||||
42
proxy/src/cache/project_info.rs
vendored
42
proxy/src/cache/project_info.rs
vendored
@@ -5,9 +5,11 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use rand::{thread_rng, Rng};
|
||||
use smol_str::SmolStr;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{debug, info};
|
||||
|
||||
@@ -21,11 +23,12 @@ use crate::{
|
||||
|
||||
use super::{Cache, Cached};
|
||||
|
||||
#[async_trait]
|
||||
pub trait ProjectInfoCache {
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
|
||||
fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
|
||||
fn enable_ttl(&self);
|
||||
fn disable_ttl(&self);
|
||||
async fn decrement_active_listeners(&self);
|
||||
async fn increment_active_listeners(&self);
|
||||
}
|
||||
|
||||
struct Entry<T> {
|
||||
@@ -116,8 +119,10 @@ pub struct ProjectInfoCacheImpl {
|
||||
|
||||
start_time: Instant,
|
||||
ttl_disabled_since_us: AtomicU64,
|
||||
active_listeners_lock: Mutex<usize>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ProjectInfoCache for ProjectInfoCacheImpl {
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
|
||||
info!("invalidating allowed ips for project `{}`", project_id);
|
||||
@@ -148,15 +153,27 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
|
||||
}
|
||||
}
|
||||
}
|
||||
fn enable_ttl(&self) {
|
||||
self.ttl_disabled_since_us
|
||||
.store(u64::MAX, std::sync::atomic::Ordering::Relaxed);
|
||||
async fn decrement_active_listeners(&self) {
|
||||
let mut listeners_guard = self.active_listeners_lock.lock().await;
|
||||
if *listeners_guard == 0 {
|
||||
tracing::error!("active_listeners count is already 0, something is broken");
|
||||
return;
|
||||
}
|
||||
*listeners_guard -= 1;
|
||||
if *listeners_guard == 0 {
|
||||
self.ttl_disabled_since_us
|
||||
.store(u64::MAX, std::sync::atomic::Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
|
||||
fn disable_ttl(&self) {
|
||||
let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
|
||||
self.ttl_disabled_since_us
|
||||
.store(new_ttl, std::sync::atomic::Ordering::Relaxed);
|
||||
async fn increment_active_listeners(&self) {
|
||||
let mut listeners_guard = self.active_listeners_lock.lock().await;
|
||||
*listeners_guard += 1;
|
||||
if *listeners_guard == 1 {
|
||||
let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
|
||||
self.ttl_disabled_since_us
|
||||
.store(new_ttl, std::sync::atomic::Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,6 +185,7 @@ impl ProjectInfoCacheImpl {
|
||||
config,
|
||||
ttl_disabled_since_us: AtomicU64::new(u64::MAX),
|
||||
start_time: Instant::now(),
|
||||
active_listeners_lock: Mutex::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -432,7 +450,7 @@ mod tests {
|
||||
ttl: Duration::from_secs(1),
|
||||
gc_interval: Duration::from_secs(600),
|
||||
}));
|
||||
cache.clone().disable_ttl();
|
||||
cache.clone().increment_active_listeners().await;
|
||||
tokio::time::advance(Duration::from_secs(2)).await;
|
||||
|
||||
let project_id: ProjectId = "project".into();
|
||||
@@ -489,7 +507,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_disable_ttl_invalidate_added_before() {
|
||||
async fn test_increment_active_listeners_invalidate_added_before() {
|
||||
tokio::time::pause();
|
||||
let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
|
||||
size: 2,
|
||||
@@ -514,7 +532,7 @@ mod tests {
|
||||
(&user1).into(),
|
||||
secret1.clone(),
|
||||
);
|
||||
cache.clone().disable_ttl();
|
||||
cache.clone().increment_active_listeners().await;
|
||||
tokio::time::advance(Duration::from_millis(100)).await;
|
||||
cache.insert_role_secret(
|
||||
(&project_id).into(),
|
||||
|
||||
@@ -6,6 +6,7 @@ use crate::{
|
||||
error::{ReportableError, UserFacingError},
|
||||
metrics::{Metrics, NumDbConnectionsGuard},
|
||||
proxy::neon_option,
|
||||
Host,
|
||||
};
|
||||
use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
@@ -101,6 +102,16 @@ impl ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_host(&self) -> Result<Host, WakeComputeError> {
|
||||
match self.0.get_hosts() {
|
||||
[tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()),
|
||||
// we should not have multiple address or unix addresses.
|
||||
_ => Err(WakeComputeError::BadComputeAddress(
|
||||
"invalid compute address".into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply startup message params to the connection config.
|
||||
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
// Only set `user` if it's not present in the config.
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
console::locks::ApiLocks,
|
||||
rate_limiter::RateBucketInfo,
|
||||
serverless::GlobalConnPoolOptions,
|
||||
Host,
|
||||
};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use itertools::Itertools;
|
||||
@@ -34,6 +36,7 @@ pub struct ProxyConfig {
|
||||
pub handshake_timeout: Duration,
|
||||
pub aws_region: String,
|
||||
pub wake_compute_retry_config: RetryConfig,
|
||||
pub connect_compute_locks: ApiLocks<Host>,
|
||||
pub connect_to_compute_retry_config: RetryConfig,
|
||||
}
|
||||
|
||||
@@ -573,7 +576,7 @@ impl RetryConfig {
|
||||
}
|
||||
|
||||
/// Helper for cmdline cache options parsing.
|
||||
pub struct WakeComputeLockOptions {
|
||||
pub struct ConcurrencyLockOptions {
|
||||
/// The number of shards the lock map should have
|
||||
pub shards: usize,
|
||||
/// The number of allowed concurrent requests for each endpoitn
|
||||
@@ -584,9 +587,12 @@ pub struct WakeComputeLockOptions {
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
impl WakeComputeLockOptions {
|
||||
impl ConcurrencyLockOptions {
|
||||
/// Default options for [`crate::console::provider::ApiLocks`].
|
||||
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
|
||||
/// Default options for [`crate::console::provider::ApiLocks`].
|
||||
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
|
||||
"shards=64,permits=50,epoch=10m,timeout=500ms";
|
||||
|
||||
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
|
||||
|
||||
@@ -636,7 +642,7 @@ impl WakeComputeLockOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for WakeComputeLockOptions {
|
||||
impl FromStr for ConcurrencyLockOptions {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(options: &str) -> Result<Self, Self::Err> {
|
||||
@@ -672,7 +678,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_parse_lock_options() -> anyhow::Result<()> {
|
||||
let WakeComputeLockOptions {
|
||||
let ConcurrencyLockOptions {
|
||||
epoch,
|
||||
permits,
|
||||
shards,
|
||||
@@ -683,7 +689,7 @@ mod tests {
|
||||
assert_eq!(shards, 32);
|
||||
assert_eq!(permits, 4);
|
||||
|
||||
let WakeComputeLockOptions {
|
||||
let ConcurrencyLockOptions {
|
||||
epoch,
|
||||
permits,
|
||||
shards,
|
||||
@@ -694,7 +700,7 @@ mod tests {
|
||||
assert_eq!(shards, 16);
|
||||
assert_eq!(permits, 8);
|
||||
|
||||
let WakeComputeLockOptions {
|
||||
let ConcurrencyLockOptions {
|
||||
epoch,
|
||||
permits,
|
||||
shards,
|
||||
|
||||
@@ -75,7 +75,6 @@ pub type ComputeReady = DatabaseInfo;
|
||||
|
||||
// TODO: replace with an http-based protocol.
|
||||
struct MgmtHandler;
|
||||
#[async_trait::async_trait]
|
||||
impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::{
|
||||
scram, EndpointCacheKey,
|
||||
};
|
||||
use dashmap::DashMap;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::{hash::Hash, sync::Arc, time::Duration};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
use tokio::time::Instant;
|
||||
use tracing::info;
|
||||
@@ -447,16 +447,16 @@ impl ApiCaches {
|
||||
}
|
||||
|
||||
/// Various caches for [`console`](super).
|
||||
pub struct ApiLocks {
|
||||
pub struct ApiLocks<K> {
|
||||
name: &'static str,
|
||||
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
|
||||
node_locks: DashMap<K, Arc<Semaphore>>,
|
||||
permits: usize,
|
||||
timeout: Duration,
|
||||
epoch: std::time::Duration,
|
||||
metrics: &'static ApiLockMetrics,
|
||||
}
|
||||
|
||||
impl ApiLocks {
|
||||
impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
pub fn new(
|
||||
name: &'static str,
|
||||
permits: usize,
|
||||
@@ -475,10 +475,7 @@ impl ApiLocks {
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_wake_compute_permit(
|
||||
&self,
|
||||
key: &EndpointCacheKey,
|
||||
) -> Result<WakeComputePermit, errors::WakeComputeError> {
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, errors::WakeComputeError> {
|
||||
if self.permits == 0 {
|
||||
return Ok(WakeComputePermit { permit: None });
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::{
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
scram, Normalize,
|
||||
scram, EndpointCacheKey, Normalize,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
use futures::TryFutureExt;
|
||||
@@ -25,7 +25,7 @@ use tracing::{error, info, info_span, warn, Instrument};
|
||||
pub struct Api {
|
||||
endpoint: http::Endpoint,
|
||||
pub caches: &'static ApiCaches,
|
||||
pub locks: &'static ApiLocks,
|
||||
pub locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
jwt: String,
|
||||
}
|
||||
@@ -35,7 +35,7 @@ impl Api {
|
||||
pub fn new(
|
||||
endpoint: http::Endpoint,
|
||||
caches: &'static ApiCaches,
|
||||
locks: &'static ApiLocks,
|
||||
locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> Self {
|
||||
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
@@ -289,7 +289,7 @@ impl super::Api for Api {
|
||||
return Err(WakeComputeError::TooManyConnections);
|
||||
}
|
||||
|
||||
let permit = self.locks.get_wake_compute_permit(&key).await?;
|
||||
let permit = self.locks.get_permit(&key).await?;
|
||||
|
||||
// after getting back a permit - it's possible the cache was filled
|
||||
// double check
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
pub mod health_server;
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::{str::FromStr, sync::Arc, time::Duration};
|
||||
|
||||
use futures::FutureExt;
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
@@ -103,12 +103,12 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
|
||||
use hyper::{
|
||||
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
|
||||
service::Service,
|
||||
use hyper_util::client::legacy::connect::dns::{
|
||||
GaiResolver as HyperGaiResolver, Name as HyperName,
|
||||
};
|
||||
use reqwest::dns::{Addrs, Resolve, Resolving};
|
||||
use reqwest::dns::{Addrs, Name, Resolve, Resolving};
|
||||
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
|
||||
use tower_service::Service;
|
||||
#[derive(Debug)]
|
||||
pub struct GaiResolver(HyperGaiResolver);
|
||||
|
||||
@@ -121,11 +121,12 @@ impl Default for GaiResolver {
|
||||
impl Resolve for GaiResolver {
|
||||
fn resolve(&self, name: Name) -> Resolving {
|
||||
let this = &mut self.0.clone();
|
||||
let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid");
|
||||
let start = Instant::now();
|
||||
Box::pin(
|
||||
Service::<Name>::call(this, name.clone()).map(move |result| {
|
||||
Service::<HyperName>::call(this, hyper_name).map(move |result| {
|
||||
let resolve_duration = start.elapsed();
|
||||
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
|
||||
trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete");
|
||||
result
|
||||
.map(|addrs| -> Addrs { Box::new(addrs) })
|
||||
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
|
||||
|
||||
@@ -159,6 +159,9 @@ smol_str_wrapper!(EndpointCacheKey);
|
||||
|
||||
smol_str_wrapper!(DbName);
|
||||
|
||||
// postgres hostname, will likely be a port:ip addr
|
||||
smol_str_wrapper!(Host);
|
||||
|
||||
// Endpoints are a bit tricky. Rare they might be branches or projects.
|
||||
impl EndpointId {
|
||||
pub fn is_endpoint(&self) -> bool {
|
||||
|
||||
@@ -123,6 +123,12 @@ pub struct ProxyMetrics {
|
||||
/// Number of retries (per outcome, per retry_type).
|
||||
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
|
||||
pub retries_metric: HistogramVec<RetriesMetricSet, 9>,
|
||||
|
||||
/// Number of events consumed from redis (per event type).
|
||||
pub redis_events_count: CounterVec<StaticLabelSet<RedisEventsCount>>,
|
||||
|
||||
#[metric(namespace = "connect_compute_lock")]
|
||||
pub connect_compute_lock: ApiLockMetrics,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
@@ -146,6 +152,12 @@ impl Default for ProxyMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ApiLockMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "direction")]
|
||||
pub enum HttpDirection {
|
||||
@@ -530,3 +542,14 @@ pub enum RetryType {
|
||||
WakeCompute,
|
||||
ConnectToCompute,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
|
||||
#[label(singleton = "event")]
|
||||
pub enum RedisEventsCount {
|
||||
EndpointCreated,
|
||||
BranchCreated,
|
||||
ProjectCreated,
|
||||
CancelSession,
|
||||
PasswordUpdate,
|
||||
AllowedIpsUpdate,
|
||||
}
|
||||
|
||||
@@ -301,7 +301,10 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
let mut node = connect_to_compute(
|
||||
ctx,
|
||||
&TcpMechanism { params: ¶ms },
|
||||
&TcpMechanism {
|
||||
params: ¶ms,
|
||||
locks: &config.connect_compute_locks,
|
||||
},
|
||||
&user_info,
|
||||
mode.allow_self_signed_compute(config),
|
||||
config.wake_compute_retry_config,
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::{
|
||||
auth::backend::ComputeCredentialKeys,
|
||||
compute::{self, PostgresConnection},
|
||||
config::RetryConfig,
|
||||
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
|
||||
console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo},
|
||||
context::RequestMonitoring,
|
||||
error::ReportableError,
|
||||
metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
|
||||
@@ -10,6 +10,7 @@ use crate::{
|
||||
retry::{retry_after, ShouldRetry},
|
||||
wake_compute::wake_compute,
|
||||
},
|
||||
Host,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use pq_proto::StartupMessageParams;
|
||||
@@ -64,6 +65,9 @@ pub trait ComputeConnectBackend {
|
||||
pub struct TcpMechanism<'a> {
|
||||
/// KV-dictionary with PostgreSQL connection params.
|
||||
pub params: &'a StartupMessageParams,
|
||||
|
||||
/// connect_to_compute concurrency lock
|
||||
pub locks: &'static ApiLocks<Host>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -79,6 +83,8 @@ impl ConnectMechanism for TcpMechanism<'_> {
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<PostgresConnection, Self::Error> {
|
||||
let host = node_info.config.get_host()?;
|
||||
let _permit = self.locks.get_permit(&host).await?;
|
||||
node_info.connect(ctx, timeout).await
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::metrics::{
|
||||
WakeupFailureKind,
|
||||
};
|
||||
use crate::proxy::retry::retry_after;
|
||||
use hyper::StatusCode;
|
||||
use hyper1::StatusCode;
|
||||
use std::ops::ControlFlow;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ use futures::StreamExt;
|
||||
use pq_proto::CancelKeyData;
|
||||
use redis::aio::PubSub;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
@@ -11,7 +12,7 @@ use crate::{
|
||||
cache::project_info::ProjectInfoCache,
|
||||
cancellation::{CancelMap, CancellationHandler},
|
||||
intern::{ProjectIdInt, RoleNameInt},
|
||||
metrics::{Metrics, RedisErrors},
|
||||
metrics::{Metrics, RedisErrors, RedisEventsCount},
|
||||
};
|
||||
|
||||
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
@@ -77,6 +78,16 @@ struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
|
||||
region_id: String,
|
||||
}
|
||||
|
||||
impl<C: ProjectInfoCache + Send + Sync + 'static> Clone for MessageHandler<C> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
cache: self.cache.clone(),
|
||||
cancellation_handler: self.cancellation_handler.clone(),
|
||||
region_id: self.region_id.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
pub fn new(
|
||||
cache: Arc<C>,
|
||||
@@ -89,11 +100,11 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
region_id,
|
||||
}
|
||||
}
|
||||
pub fn disable_ttl(&self) {
|
||||
self.cache.disable_ttl();
|
||||
pub async fn increment_active_listeners(&self) {
|
||||
self.cache.increment_active_listeners().await;
|
||||
}
|
||||
pub fn enable_ttl(&self) {
|
||||
self.cache.enable_ttl();
|
||||
pub async fn decrement_active_listeners(&self) {
|
||||
self.cache.decrement_active_listeners().await;
|
||||
}
|
||||
#[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
|
||||
async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
|
||||
@@ -118,6 +129,10 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
"session_id",
|
||||
&tracing::field::display(cancel_session.session_id),
|
||||
);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::CancelSession);
|
||||
if let Some(cancel_region) = cancel_session.region_id {
|
||||
// If the message is not for this region, ignore it.
|
||||
if cancel_region != self.region_id {
|
||||
@@ -138,6 +153,17 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
}
|
||||
_ => {
|
||||
invalidate_cache(self.cache.clone(), msg.clone());
|
||||
if matches!(msg, AllowedIpsUpdate { .. }) {
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::AllowedIpsUpdate);
|
||||
} else if matches!(msg, PasswordUpdate { .. }) {
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::PasswordUpdate);
|
||||
}
|
||||
// It might happen that the invalid entry is on the way to be cached.
|
||||
// To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
|
||||
// TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
|
||||
@@ -167,37 +193,24 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle console's invalidation messages.
|
||||
#[tracing::instrument(name = "console_notifications", skip_all)]
|
||||
pub async fn task_main<C>(
|
||||
async fn handle_messages<C: ProjectInfoCache + Send + Sync + 'static>(
|
||||
handler: MessageHandler<C>,
|
||||
redis: ConnectionWithCredentialsProvider,
|
||||
cache: Arc<C>,
|
||||
cancel_map: CancelMap,
|
||||
region_id: String,
|
||||
) -> anyhow::Result<Infallible>
|
||||
where
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
{
|
||||
cache.enable_ttl();
|
||||
let handler = MessageHandler::new(
|
||||
cache,
|
||||
Arc::new(CancellationHandler::<()>::new(
|
||||
cancel_map,
|
||||
crate::metrics::CancellationSource::FromRedis,
|
||||
)),
|
||||
region_id,
|
||||
);
|
||||
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
if cancellation_token.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut conn = match try_connect(&redis).await {
|
||||
Ok(conn) => {
|
||||
handler.disable_ttl();
|
||||
handler.increment_active_listeners().await;
|
||||
conn
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!(
|
||||
"failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}"
|
||||
);
|
||||
"failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}"
|
||||
);
|
||||
tokio::time::sleep(RECONNECT_TIMEOUT).await;
|
||||
continue;
|
||||
}
|
||||
@@ -211,8 +224,47 @@ where
|
||||
break;
|
||||
}
|
||||
}
|
||||
if cancellation_token.is_cancelled() {
|
||||
handler.decrement_active_listeners().await;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
handler.enable_ttl();
|
||||
handler.decrement_active_listeners().await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle console's invalidation messages.
|
||||
#[tracing::instrument(name = "redis_notifications", skip_all)]
|
||||
pub async fn task_main<C>(
|
||||
redis: ConnectionWithCredentialsProvider,
|
||||
cache: Arc<C>,
|
||||
cancel_map: CancelMap,
|
||||
region_id: String,
|
||||
) -> anyhow::Result<Infallible>
|
||||
where
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
{
|
||||
let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
|
||||
cancel_map,
|
||||
crate::metrics::CancellationSource::FromRedis,
|
||||
));
|
||||
let handler = MessageHandler::new(cache, cancellation_handler, region_id);
|
||||
// 6h - 1m.
|
||||
// There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost.
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60));
|
||||
loop {
|
||||
let cancellation_token = CancellationToken::new();
|
||||
interval.tick().await;
|
||||
|
||||
tokio::spawn(handle_messages(
|
||||
handler.clone(),
|
||||
redis.clone(),
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
tokio::spawn(async move {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h.
|
||||
cancellation_token.cancel();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,11 +9,13 @@ use crate::{
|
||||
config::{AuthenticationConfig, ProxyConfig},
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
locks::ApiLocks,
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
error::{ErrorKind, ReportableError, UserFacingError},
|
||||
proxy::connect_compute::ConnectMechanism,
|
||||
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
|
||||
Host,
|
||||
};
|
||||
|
||||
use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
|
||||
@@ -105,6 +107,7 @@ impl PoolingBackend {
|
||||
conn_id,
|
||||
conn_info,
|
||||
pool: self.pool.clone(),
|
||||
locks: &self.config.connect_compute_locks,
|
||||
},
|
||||
&backend,
|
||||
false, // do not allow self signed compute for http flow
|
||||
@@ -154,16 +157,31 @@ impl UserFacingError for HttpConnError {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for HttpConnError {
|
||||
fn could_retry(&self) -> bool {
|
||||
match self {
|
||||
HttpConnError::ConnectionError(e) => e.could_retry(),
|
||||
HttpConnError::ConnectionClosedAbruptly(_) => false,
|
||||
HttpConnError::GetAuthInfo(_) => false,
|
||||
HttpConnError::AuthError(_) => false,
|
||||
HttpConnError::WakeCompute(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct TokioMechanism {
|
||||
pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
|
||||
conn_info: ConnInfo,
|
||||
conn_id: uuid::Uuid,
|
||||
|
||||
/// connect_to_compute concurrency lock
|
||||
locks: &'static ApiLocks<Host>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TokioMechanism {
|
||||
type Connection = Client<tokio_postgres::Client>;
|
||||
type ConnectError = tokio_postgres::Error;
|
||||
type ConnectError = HttpConnError;
|
||||
type Error = HttpConnError;
|
||||
|
||||
async fn connect_once(
|
||||
@@ -172,6 +190,9 @@ impl ConnectMechanism for TokioMechanism {
|
||||
node_info: &CachedNodeInfo,
|
||||
timeout: Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
let host = node_info.config.get_host()?;
|
||||
let permit = self.locks.get_permit(&host).await?;
|
||||
|
||||
let mut config = (*node_info.config).clone();
|
||||
let config = config
|
||||
.user(&self.conn_info.user_info.user)
|
||||
@@ -182,6 +203,7 @@ impl ConnectMechanism for TokioMechanism {
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
|
||||
drop(pause);
|
||||
drop(permit);
|
||||
|
||||
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
|
||||
Ok(poll_client(
|
||||
|
||||
@@ -23,7 +23,7 @@ moto = {extras = ["server"], version = "^4.1.2"}
|
||||
backoff = "^2.2.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
pytest-timeout = "^2.3.1"
|
||||
Werkzeug = "^3.0.1"
|
||||
pytest-order = "^1.1.0"
|
||||
allure-pytest = "^2.13.2"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.77.0"
|
||||
channel = "1.78.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -22,7 +22,12 @@ serde_with.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
utils.workspace = true
|
||||
async-stream.workspace = true
|
||||
native-tls.workspace = true
|
||||
postgres-native-tls.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util = { workspace = true }
|
||||
futures-util.workspace = true
|
||||
itertools.workspace = true
|
||||
camino.workspace = true
|
||||
|
||||
@@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted.
|
||||
|
||||
#### `scan-metadata`
|
||||
|
||||
Walk objects in a pageserver S3 bucket, and report statistics on the contents.
|
||||
Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency.
|
||||
Errors are logged to stderr and summary to stdout.
|
||||
|
||||
For pageserver:
|
||||
```
|
||||
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
|
||||
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
|
||||
|
||||
Timelines: 31106
|
||||
With errors: 3
|
||||
@@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
|
||||
Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
|
||||
```
|
||||
|
||||
For safekeepers, dump_db_connstr and dump_db_table must be
|
||||
specified; they should point to table with debug dump which will be used
|
||||
to list timelines and find their backup and start LSNs.
|
||||
|
||||
## Cleaning up running pageservers
|
||||
|
||||
If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::Future;
|
||||
use hex::FromHex;
|
||||
|
||||
use reqwest::{header, Client, StatusCode, Url};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -137,7 +139,7 @@ pub struct ProjectData {
|
||||
pub region_id: String,
|
||||
pub platform_id: String,
|
||||
pub user_id: String,
|
||||
pub pageserver_id: u64,
|
||||
pub pageserver_id: Option<u64>,
|
||||
#[serde(deserialize_with = "from_nullable_id")]
|
||||
pub tenant: TenantId,
|
||||
pub safekeepers: Vec<SafekeeperData>,
|
||||
@@ -155,7 +157,7 @@ pub struct ProjectData {
|
||||
pub maintenance_set: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub struct BranchData {
|
||||
pub id: BranchId,
|
||||
pub created_at: DateTime<Utc>,
|
||||
@@ -210,30 +212,39 @@ impl CloudAdminApiClient {
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("tenant_id", tenant_id.to_string()),
|
||||
("show_deleted", "true".to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"Find project for tenant".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
let response = CloudAdminApiClient::with_retries(
|
||||
|| async {
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("tenant_id", tenant_id.to_string()),
|
||||
("show_deleted", "true".to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"Find project for tenant".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let response: AdminApiResponse<Vec<ProjectData>> =
|
||||
response.json().await.map_err(|e| {
|
||||
Error::new(
|
||||
"Find project for tenant".to_string(),
|
||||
ErrorKind::BodyRead(e),
|
||||
)
|
||||
})?;
|
||||
Ok(response)
|
||||
},
|
||||
"find_tenant_project",
|
||||
)
|
||||
.await?;
|
||||
|
||||
let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
|
||||
Error::new(
|
||||
"Find project for tenant".to_string(),
|
||||
ErrorKind::BodyRead(e),
|
||||
)
|
||||
})?;
|
||||
match response.data.len() {
|
||||
0 => Ok(None),
|
||||
1 => Ok(Some(
|
||||
@@ -261,42 +272,34 @@ impl CloudAdminApiClient {
|
||||
const PAGINATION_LIMIT: usize = 512;
|
||||
let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
|
||||
loop {
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("show_deleted", "false".to_string()),
|
||||
("limit", format!("{PAGINATION_LIMIT}")),
|
||||
("offset", format!("{pagination_offset}")),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"List active projects".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
let response_bytes = CloudAdminApiClient::with_retries(
|
||||
|| async {
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("show_deleted", "false".to_string()),
|
||||
("limit", format!("{PAGINATION_LIMIT}")),
|
||||
("offset", format!("{pagination_offset}")),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"List active projects".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
|
||||
match response.status() {
|
||||
StatusCode::OK => {}
|
||||
StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
continue;
|
||||
}
|
||||
_status => {
|
||||
return Err(Error::new(
|
||||
"List active projects".to_string(),
|
||||
ErrorKind::ResponseStatus(response.status()),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
let response_bytes = response.bytes().await.map_err(|e| {
|
||||
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
|
||||
})?;
|
||||
response.bytes().await.map_err(|e| {
|
||||
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
|
||||
})
|
||||
},
|
||||
"list_projects",
|
||||
)
|
||||
.await?;
|
||||
|
||||
let decode_result =
|
||||
serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
|
||||
@@ -327,6 +330,7 @@ impl CloudAdminApiClient {
|
||||
|
||||
pub async fn find_timeline_branch(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Option<BranchData>, Error> {
|
||||
let _permit = self
|
||||
@@ -335,43 +339,61 @@ impl CloudAdminApiClient {
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/branches"))
|
||||
.query(&[
|
||||
("timeline_id", timeline_id.to_string()),
|
||||
("show_deleted", "true".to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"Find branch for timeline".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
let response = CloudAdminApiClient::with_retries(
|
||||
|| async {
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/branches"))
|
||||
.query(&[
|
||||
("timeline_id", timeline_id.to_string()),
|
||||
("show_deleted", "true".to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"Find branch for timeline".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
|
||||
Error::new(
|
||||
"Find branch for timeline".to_string(),
|
||||
ErrorKind::BodyRead(e),
|
||||
)
|
||||
})?;
|
||||
match response.data.len() {
|
||||
0 => Ok(None),
|
||||
1 => Ok(Some(
|
||||
response
|
||||
.data
|
||||
.into_iter()
|
||||
.next()
|
||||
.expect("Should have exactly one element"),
|
||||
)),
|
||||
too_many => Err(Error::new(
|
||||
format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
|
||||
let response: AdminApiResponse<Vec<BranchData>> =
|
||||
response.json().await.map_err(|e| {
|
||||
Error::new(
|
||||
"Find branch for timeline".to_string(),
|
||||
ErrorKind::BodyRead(e),
|
||||
)
|
||||
})?;
|
||||
Ok(response)
|
||||
},
|
||||
"find_timeline_branch",
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut branches: Vec<BranchData> = response.data.into_iter().collect();
|
||||
// Normally timeline_id is unique. However, we do have at least one case
|
||||
// of the same timeline_id in two different projects, apparently after
|
||||
// manual recovery. So always recheck project_id (discovered through
|
||||
// tenant_id).
|
||||
let project_data = match self.find_tenant_project(tenant_id).await? {
|
||||
Some(pd) => pd,
|
||||
None => return Ok(None),
|
||||
};
|
||||
branches.retain(|b| b.project_id == project_data.id);
|
||||
if branches.len() < 2 {
|
||||
Ok(branches.first().cloned())
|
||||
} else {
|
||||
Err(Error::new(
|
||||
format!(
|
||||
"Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
branches.len()
|
||||
),
|
||||
ErrorKind::UnexpectedState,
|
||||
)),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -532,4 +554,15 @@ impl CloudAdminApiClient {
|
||||
.parse()
|
||||
.unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
|
||||
}
|
||||
|
||||
async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, Error>>,
|
||||
{
|
||||
let cancel = CancellationToken::new(); // not really used
|
||||
backoff::retry(op, |_| false, 1, 20, description, &cancel)
|
||||
.await
|
||||
.expect("cancellations are disabled")
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user