Compare commits

17 Commits

Author SHA1 Message Date
Konstantin Knizhnik
4a4c1b06eb Optimize calculation of HLL's register maximum 2024-10-02 15:55:26 +03:00
Konstantin Knizhnik
a59720b5f5 Use simplest way of calculating access counter 2024-09-27 17:41:30 +03:00
Konstantin Knizhnik
65c519f36c Use exponential histogram to improve prediction of LFC size 2024-09-23 09:45:14 +03:00
Heikki Linnakangas
e16e82749f Remove unused crates from workspace Cargo.toml
These were not referenced in any of the other Cargo.toml files in the
workspace. They were not being built because of that, so there was
little harm in having them listed, but let's be tidy.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
9f653893b9 Update a few dependencies, removing some indirect dependencies
    cargo update ciborium iana-time-zone lazy_static schannel uuid
    cargo update hyper@0.14
    cargo update --precise 2.9.7 ureq

It might be worthwhile to just update all our dependencies at some
point, but this is aimed at pruning the dependency tree, to make the
build a little faster. That's also why I didn't update ureq to the
latest version: that would've added a dependency on yet another version
of rustls.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
913af44219 Update "memoffset" crate
To eliminate one version of it from our dependency tree.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
ecd615ab6d Update "hostname" crate
We were already building v0.4.0 as an indirect dependency, so this
avoids having to build two different versions of it.
2024-09-23 00:37:41 +03:00
Heikki Linnakangas
c9b2ec9ff1 Check submodule forward progress (#8949)
We frequently mess up our submodule references. This adds one safeguard:
it checks that the submodule references are only updated "forwards", not
to some older commit, or to a commit that's not a descendant of the
previous one.
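
The action's internals aren't shown in this diff, but the forward-progress property it enforces is git's ancestry test. A minimal sketch of the same check (hypothetical helper, shelling out to git):

```rust
use std::process::Command;

/// Returns true iff `new` is a fast-forward of `old`, i.e. `old` is an
/// ancestor of `new`. `git merge-base --is-ancestor A B` exits 0 exactly
/// when A is an ancestor of B; a commit counts as its own ancestor, so an
/// unchanged reference also passes (cf. `pass_if_unchanged` below).
fn is_forward_progress(repo: &str, old: &str, new: &str) -> std::io::Result<bool> {
    let status = Command::new("git")
        .args(["-C", repo, "merge-base", "--is-ancestor", old, new])
        .status()?;
    Ok(status.success())
}
```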

As a next step, I'm thinking that we should automate things so that when
you merge a PR to the 'neon' repository that updates the submodule
references, the REL_*_STABLE_neon branches are automatically updated to
match the submodule references. That way, you never need to manually
merge PRs in the postgres repository, it's all triggered from commits in
the 'neon' repository. But that's not included here.
2024-09-22 21:46:53 +03:00
Arpad Müller
a3800dcb0c Move load_timeline_metadata into separate function (#9080)
Moves the per-timeline code to load timeline metadata into a new
dedicated function called `load_timeline_metadata`. The old
`load_timeline_metadata` becomes `load_timelines_metadata`.
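
A minimal sketch of the resulting shape (hypothetical names; `load_one` stands in for the new per-timeline function): each timeline gets its own named future, which the caller spawns into a `JoinSet`:

```rust
use tokio::task::JoinSet;

// Hypothetical stand-in for the per-timeline loader: returns a future
// instead of spawning an inline async block at the call site.
fn load_one(timeline_id: u64) -> impl std::future::Future<Output = (u64, String)> {
    async move { (timeline_id, format!("index part for timeline {timeline_id}")) }
}

#[tokio::main]
async fn main() {
    let mut part_downloads = JoinSet::new();
    for id in [1, 2, 3] {
        part_downloads.spawn(load_one(id));
    }
    // Collect preloads as the downloads complete, in any order.
    while let Some(res) = part_downloads.join_next().await {
        let (id, preload) = res.expect("join preload task");
        println!("timeline {id}: {preload}");
    }
}
```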

Split out of #8907

Part of #8088
2024-09-21 12:36:41 +00:00
Heikki Linnakangas
9a32aa828d Fix init of WAL page header at startup (#8914)
If the primary is started at an LSN within the first page of a 16 MB WAL
segment, the "long XLOG page header" at the beginning of the segment was
not initialized correctly. That has gone unnoticed, because under normal
circumstances nothing looks at the page header. The WAL that is
streamed to the safekeepers starts at the new record's LSN, not at the
beginning of the page, so the bogus page header didn't propagate
elsewhere, and a primary server doesn't normally read the WAL it has
written. That's just as well, because the contents of the page would be
bogus anyway: it wouldn't contain any of the records before the LSN
where the new record is written.

Except that in the following cases a primary does read its own WAL:

1. When there are two-phase transactions in prepared state at
checkpoint. The checkpointer reads the two-phase state from the
XLOG_XACT_PREPARE record, and writes it to a file in pg_twophase/.

2. Logical decoding reads the WAL starting from the replication slot's
restart LSN.

This PR fixes the problem with two-phase transactions. For that, it's
sufficient to initialize the page header correctly. The checkpointer
only needs to read XLOG_XACT_PREPARE records that were generated after
the server startup, so it's still OK that older WAL is missing / bogus.
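
For reference, the header in question is PostgreSQL's long page header, which appears only on the first page of each segment. A Rust mirror of the C layout from `xlog_internal.h` (for illustration only; this is not Neon code):

```rust
/// Mirror of PostgreSQL's XLogPageHeaderData (xlog_internal.h).
#[repr(C)]
struct XLogPageHeaderData {
    xlp_magic: u16,    // magic value for correctness checks
    xlp_info: u16,     // flag bits; XLP_LONG_HEADER is set on a segment's first page
    xlp_tli: u32,      // TimeLineID of the first record on the page
    xlp_pageaddr: u64, // WAL address (LSN) of this page
    xlp_rem_len: u32,  // bytes remaining of a record continued from the previous page
}

/// Mirror of XLogLongPageHeaderData: the extended header at the start of
/// every 16 MB segment, i.e. the header that was not being initialized
/// correctly.
#[repr(C)]
struct XLogLongPageHeaderData {
    std: XLogPageHeaderData,
    xlp_sysid: u64,       // system identifier from pg_control
    xlp_seg_size: u32,    // segment size, as a cross-check
    xlp_xlog_blcksz: u32, // WAL block size, as a cross-check
}
```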

I have not investigated if we have a problem with logical decoding,
however. Let's deal with that separately.

Special thanks to @Lzjing-1997, who independently found the same bug
and opened a PR to fix it, although I did not use that PR.
2024-09-21 04:00:38 +03:00
Anastasia Lubennikova
f03f7b3868 Bump vendor/postgres to include extension path fix (#9076)
This is a prerequisite for
https://github.com/neondatabase/neon/pull/8681
2024-09-20 20:24:40 +03:00
Christian Schwarz
ec5dce04eb pageserver: throttling: per-tenant metrics + more metrics to help understand throttle queue depth (#9077) 2024-09-20 16:48:26 +00:00
John Spray
6014f15157 pageserver: suppress noisy "layer became visible" logs (#9064)
## Problem

When layer visibility was added, an info log was included for the
situation where actual access to a layer disagrees with the visibility
calculation. This situation is safe, but I was interested in seeing when
it happens.

The log is pretty high volume, so this PR refines it to fire less often.

## Summary of changes

- For cases where accessing non-visible layers is normal, don't log at
all.
- Extend a unit test to increase confidence that the updates to
visibility on access are working as expected
- During compaction, only call the visibility calculation routine if
some image layers were created: previously, frequent calls resulted in
the visibility of layers getting reset every time we passed through
create_image_layers.
2024-09-20 16:07:09 +00:00
Conrad Ludgate
e675a21346 utils: leaky bucket should only report throttled if the notify queue is blocked on sleep (#9072)
## Problem

It seems that the pageserver might be too eager in reporting throttled tasks.

## Summary of changes

Introduce a sleep counter. If the sleep counter increases, then the
acquire task was throttled.
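
Distilled from the diff further down, the mechanism looks roughly like this: a drop guard bumps a shared counter whether the sleep completes or is cancelled, and an acquirer reports throttling only if the counter advanced while it waited:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Duration;

/// Increments the counter on drop, so the count advances both when the
/// sleep completes and when the sleeping task is cancelled mid-wait.
struct Increment<'a>(&'a AtomicU64);

impl Drop for Increment<'_> {
    fn drop(&mut self) {
        self.0.fetch_add(1, Ordering::AcqRel);
    }
}

async fn sleep_counted(counter: &AtomicU64, ready_in: Duration) {
    let _inc = Increment(counter);
    tokio::time::sleep(ready_in).await;
}

/// A waiter was throttled iff the counter moved past the value it
/// observed when it started acquiring.
fn was_throttled(counter: &AtomicU64, start_count: u64) -> bool {
    counter.load(Ordering::Acquire) > start_count
}
```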
2024-09-20 16:09:39 +01:00
Alex Chi Z.
6b93230270 fix(pageserver): receive body error now 500 (#9052)
close https://github.com/neondatabase/neon/issues/8903

In https://github.com/neondatabase/neon/issues/8903 we observed a JSON
decoding error with the following error message in the log:

```
Error processing HTTP request: Resource temporarily unavailable: 3956 (pageserver-6.ap-southeast-1.aws.neon.tech) error receiving body: error decoding response body
```

This is hard to understand. In this patch, we make the error message
more reasonable.

## Summary of changes

* The receive-body error is now an internal server error, passing
through the `reqwest::Error` (only decoding errors) as `anyhow::Error`.
* Instead of formatting the error using `to_string`, we use the
alternative `anyhow::Error` formatting, so that it prints out the cause
of the error (i.e., what exactly serde cannot decode).

I would expect to see something like `error receiving body: error
decoding response body: XXX field not found` after this patch, though I
didn't set up a testing environment to observe the exact behavior.
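
For illustration, `anyhow`'s alternate `Display` formatting walks the whole context chain, which is what surfaces the cause (hypothetical error chain shaped like the log line above):

```rust
use anyhow::anyhow;

fn main() {
    let err = anyhow!("XXX field not found")
        .context("error decoding response body")
        .context("error receiving body");

    // Default Display shows only the outermost message:
    println!("{err}");   // error receiving body
    // Alternate formatting appends the cause chain:
    println!("{err:#}"); // error receiving body: error decoding response body: XXX field not found
}
```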

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-09-20 10:37:28 -04:00
Heikki Linnakangas
797aa4ffaa Skip running clippy in --release mode. (#9073)
It's pretty expensive to run, and there is very little difference
between debug and release builds that could lead to different clippy
warnings.
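
As the workflow comment in the diff below spells out, the only profile-dependent code clippy would see differently is whatever is gated on debug assertions, e.g. (illustrative sketch):

```rust
fn next_id(x: u64) -> u64 {
    // Compiled (and therefore linted) only in debug builds; a release-mode
    // clippy run would lint everything else identically.
    #[cfg(debug_assertions)]
    {
        assert_ne!(x, u64::MAX, "id space exhausted");
    }
    x.wrapping_add(1)
}

fn main() {
    println!("{}", next_id(41)); // 42
}
```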

This is extracted from PR #8912. That PR wandered off into various
improvements we could make, but we seem to have consensus on this part
at least.
2024-09-20 17:22:58 +03:00
Christian Schwarz
c45b56e0bb pageserver: add counters for started smgr/getpage requests (#9069)
After this PR

```
curl localhost:9898/metrics | grep smgr_ | grep start
```

```
pageserver_smgr_query_started_count{shard_id="0000",smgr_query_type="get_page_at_lsn",tenant_id="...",timeline_id="..."} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_db_size"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_page_at_lsn"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_exists"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_size"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_slru_segment"} 0
```

We instantiate the per-tenant counter only for `get_page_at_lsn`.
2024-09-20 14:55:50 +01:00
29 changed files with 764 additions and 436 deletions

View File

@@ -120,6 +120,59 @@ jobs:
      - name: Run mypy to check types
        run: poetry run mypy .

  # Check that the vendor/postgres-* submodules point to the
  # corresponding REL_*_STABLE_neon branches.
  check-submodules:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: dorny/paths-filter@v3
        id: check-if-submodules-changed
        with:
          filters: |
            vendor:
              - 'vendor/**'
      - name: Check vendor/postgres-v14 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v14"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true
      - name: Check vendor/postgres-v15 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v15"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true
      - name: Check vendor/postgres-v16 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v16"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true
      - name: Check vendor/postgres-v17 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v17"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true

  check-codestyle-rust:
    needs: [ check-permissions, build-build-tools-image ]
    strategy:
@@ -159,6 +212,10 @@ jobs:
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
      #
      # The only difference between "clippy --debug" and "clippy --release" is that in --release
      # mode, #[cfg(debug_assertions)] blocks are not built. It's not worth building everything a
      # second time just for that, so skip "clippy --release".
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
@@ -168,8 +225,6 @@ jobs:
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
@@ -1152,6 +1207,7 @@ jobs:
    # Usually we do `needs: [...]`
    needs:
      - build-and-test-locally
      - check-submodules
      - check-codestyle-python
      - check-codestyle-rust
      - promote-images

Cargo.lock (generated, 307 lines changed)
View File

@@ -255,12 +255,6 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "atomic"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
[[package]]
name = "atomic-take"
version = "1.1.0"
@@ -295,8 +289,8 @@ dependencies = [
"fastrand 2.0.0",
"hex",
"http 0.2.9",
"hyper 0.14.26",
"ring 0.17.6",
"hyper 0.14.30",
"ring",
"time",
"tokio",
"tracing",
@@ -486,7 +480,7 @@ dependencies = [
"once_cell",
"p256 0.11.1",
"percent-encoding",
"ring 0.17.6",
"ring",
"sha2",
"subtle",
"time",
@@ -593,7 +587,7 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"http-body 1.0.0",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper-rustls 0.24.0",
"once_cell",
"pin-project-lite",
@@ -684,7 +678,7 @@ dependencies = [
"futures-util",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper 0.14.30",
"itoa",
"matchit 0.7.0",
"memchr",
@@ -1089,9 +1083,9 @@ dependencies = [
[[package]]
name = "ciborium"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
@@ -1100,18 +1094,18 @@ dependencies = [
[[package]]
name = "ciborium-io"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half 1.8.2",
"half",
]
[[package]]
@@ -1224,7 +1218,7 @@ dependencies = [
"compute_api",
"flate2",
"futures",
"hyper 0.14.26",
"hyper 0.14.30",
"nix 0.27.1",
"notify",
"num_cpus",
@@ -1330,7 +1324,7 @@ dependencies = [
"git-version",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"nix 0.27.1",
"once_cell",
"pageserver_api",
@@ -2304,12 +2298,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "half"
version = "2.4.1"
@@ -2411,17 +2399,6 @@ dependencies = [
"digest",
]
[[package]]
name = "hostname"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867"
dependencies = [
"libc",
"match_cfg",
"winapi",
]
[[package]]
name = "hostname"
version = "0.4.0"
@@ -2430,7 +2407,7 @@ checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
dependencies = [
"cfg-if",
"libc",
"windows 0.52.0",
"windows",
]
[[package]]
@@ -2539,9 +2516,9 @@ dependencies = [
[[package]]
name = "hyper"
version = "0.14.26"
version = "0.14.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4"
checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9"
dependencies = [
"bytes",
"futures-channel",
@@ -2554,7 +2531,7 @@ dependencies = [
"httpdate",
"itoa",
"pin-project-lite",
"socket2 0.4.9",
"socket2",
"tokio",
"tower-service",
"tracing",
@@ -2589,7 +2566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"hyper 0.14.30",
"log",
"rustls 0.21.11",
"rustls-native-certs 0.6.2",
@@ -2620,7 +2597,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
"hyper 0.14.26",
"hyper 0.14.30",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
@@ -2639,7 +2616,7 @@ dependencies = [
"http-body 1.0.0",
"hyper 1.2.0",
"pin-project-lite",
"socket2 0.5.5",
"socket2",
"tokio",
"tower",
"tower-service",
@@ -2648,16 +2625,16 @@ dependencies = [
[[package]]
name = "iana-time-zone"
version = "0.1.56"
version = "0.1.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c"
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows 0.48.0",
"windows-core",
]
[[package]]
@@ -2870,7 +2847,7 @@ dependencies = [
"base64 0.21.1",
"js-sys",
"pem",
"ring 0.17.6",
"ring",
"serde",
"serde_json",
"simple_asn1",
@@ -2908,11 +2885,11 @@ dependencies = [
[[package]]
name = "lazy_static"
version = "1.4.0"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
dependencies = [
"spin 0.5.2",
"spin",
]
[[package]]
@@ -2974,12 +2951,6 @@ dependencies = [
"hashbrown 0.14.5",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4"
[[package]]
name = "matchers"
version = "0.1.0"
@@ -3072,15 +3043,6 @@ dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.9.0"
@@ -3660,7 +3622,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"indoc",
"itertools 0.10.5",
"md5",
@@ -3853,7 +3815,7 @@ dependencies = [
"ahash",
"bytes",
"chrono",
"half 2.4.1",
"half",
"hashbrown 0.14.5",
"num",
"num-bigint",
@@ -4140,7 +4102,7 @@ dependencies = [
"crc32c",
"env_logger",
"log",
"memoffset 0.8.0",
"memoffset 0.9.0",
"once_cell",
"postgres",
"regex",
@@ -4350,12 +4312,12 @@ dependencies = [
"hashlink",
"hex",
"hmac",
"hostname 0.3.1",
"hostname",
"http 1.1.0",
"http-body-util",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper 1.2.0",
"hyper-util",
"indexmap 2.0.1",
@@ -4400,7 +4362,7 @@ dependencies = [
"signature 2.2.0",
"smallvec",
"smol_str",
"socket2 0.5.5",
"socket2",
"subtle",
"thiserror",
"tikv-jemalloc-ctl",
@@ -4578,7 +4540,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
dependencies = [
"pem",
"ring 0.17.6",
"ring",
"time",
"yasna",
]
@@ -4602,7 +4564,7 @@ dependencies = [
"rustls-pki-types",
"ryu",
"sha1_smol",
"socket2 0.5.5",
"socket2",
"tokio",
"tokio-rustls 0.25.0",
"tokio-util",
@@ -4714,7 +4676,7 @@ dependencies = [
"futures-util",
"http-types",
"humantime-serde",
"hyper 0.14.26",
"hyper 0.14.30",
"itertools 0.10.5",
"metrics",
"once_cell",
@@ -4747,7 +4709,7 @@ dependencies = [
"h2 0.3.26",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper-rustls 0.24.0",
"ipnet",
"js-sys",
@@ -4905,21 +4867,6 @@ dependencies = [
"subtle",
]
[[package]]
name = "ring"
version = "0.16.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
dependencies = [
"cc",
"libc",
"once_cell",
"spin 0.5.2",
"untrusted 0.7.1",
"web-sys",
"winapi",
]
[[package]]
name = "ring"
version = "0.17.6"
@@ -4929,8 +4876,8 @@ dependencies = [
"cc",
"getrandom 0.2.11",
"libc",
"spin 0.9.8",
"untrusted 0.9.0",
"spin",
"untrusted",
"windows-sys 0.48.0",
]
@@ -4950,7 +4897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"hyper 0.14.30",
"lazy_static",
"percent-encoding",
"regex",
@@ -5074,7 +5021,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
dependencies = [
"log",
"ring 0.17.6",
"ring",
"rustls-webpki 0.101.7",
"sct",
]
@@ -5086,7 +5033,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
dependencies = [
"log",
"ring 0.17.6",
"ring",
"rustls-pki-types",
"rustls-webpki 0.102.2",
"subtle",
@@ -5143,24 +5090,14 @@ version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
[[package]]
name = "rustls-webpki"
version = "0.100.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
dependencies = [
"ring 0.16.20",
"untrusted 0.7.1",
]
[[package]]
name = "rustls-webpki"
version = "0.101.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765"
dependencies = [
"ring 0.17.6",
"untrusted 0.9.0",
"ring",
"untrusted",
]
[[package]]
@@ -5169,9 +5106,9 @@ version = "0.102.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
dependencies = [
"ring 0.17.6",
"ring",
"rustls-pki-types",
"untrusted 0.9.0",
"untrusted",
]
[[package]]
@@ -5205,7 +5142,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5262,11 +5199,11 @@ dependencies = [
[[package]]
name = "schannel"
version = "0.1.21"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3"
checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
dependencies = [
"windows-sys 0.42.0",
"windows-sys 0.52.0",
]
[[package]]
@@ -5290,8 +5227,8 @@ version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
dependencies = [
"ring 0.17.6",
"untrusted 0.9.0",
"ring",
"untrusted",
]
[[package]]
@@ -5400,7 +5337,7 @@ version = "0.32.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
dependencies = [
"hostname 0.4.0",
"hostname",
"libc",
"os_info",
"rustc_version",
@@ -5712,16 +5649,6 @@ dependencies = [
"serde",
]
[[package]]
name = "socket2"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "socket2"
version = "0.5.5"
@@ -5732,12 +5659,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "spin"
version = "0.9.8"
@@ -5783,7 +5704,7 @@ dependencies = [
"futures-util",
"git-version",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5812,7 +5733,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"itertools 0.10.5",
"lasso",
"measured",
@@ -6228,7 +6149,7 @@ dependencies = [
"num_cpus",
"pin-project-lite",
"signal-hook-registry",
"socket2 0.5.5",
"socket2",
"tokio-macros",
"windows-sys 0.48.0",
]
@@ -6288,7 +6209,7 @@ dependencies = [
"pin-project-lite",
"postgres-protocol",
"postgres-types",
"socket2 0.5.5",
"socket2",
"tokio",
"tokio-util",
]
@@ -6300,7 +6221,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
dependencies = [
"futures",
"ring 0.17.6",
"ring",
"rustls 0.22.4",
"tokio",
"tokio-postgres",
@@ -6434,7 +6355,7 @@ dependencies = [
"h2 0.3.26",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper 0.14.30",
"hyper-timeout",
"percent-encoding",
"pin-project",
@@ -6611,7 +6532,7 @@ dependencies = [
name = "tracing-utils"
version = "0.1.0"
dependencies = [
"hyper 0.14.26",
"hyper 0.14.30",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
@@ -6714,12 +6635,6 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
[[package]]
name = "untrusted"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "untrusted"
version = "0.9.0"
@@ -6728,17 +6643,18 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "ureq"
version = "2.7.1"
version = "2.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9"
checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
dependencies = [
"base64 0.21.1",
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.21.11",
"rustls-webpki 0.100.2",
"rustls 0.22.4",
"rustls-pki-types",
"rustls-webpki 0.102.2",
"url",
"webpki-roots 0.23.1",
"webpki-roots 0.26.1",
]
[[package]]
@@ -6802,7 +6718,7 @@ dependencies = [
"hex",
"hex-literal",
"humantime",
"hyper 0.14.26",
"hyper 0.14.30",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
@@ -6837,11 +6753,10 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.6.1"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
dependencies = [
"atomic",
"getrandom 0.2.11",
"serde",
]
@@ -7075,15 +6990,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webpki-roots"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
dependencies = [
"rustls-webpki 0.100.2",
]
[[package]]
name = "webpki-roots"
version = "0.25.2"
@@ -7152,15 +7058,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "windows"
version = "0.52.0"
@@ -7180,21 +7077,6 @@ dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-sys"
version = "0.42.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
dependencies = [
"windows_aarch64_gnullvm 0.42.2",
"windows_aarch64_msvc 0.42.2",
"windows_i686_gnu 0.42.2",
"windows_i686_msvc 0.42.2",
"windows_x86_64_gnu 0.42.2",
"windows_x86_64_gnullvm 0.42.2",
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
@@ -7243,12 +7125,6 @@ dependencies = [
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
@@ -7261,12 +7137,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
@@ -7279,12 +7149,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
@@ -7297,12 +7161,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
@@ -7315,12 +7173,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
@@ -7333,12 +7185,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
@@ -7351,12 +7197,6 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
@@ -7433,10 +7273,11 @@ dependencies = [
"futures-util",
"generic-array",
"getrandom 0.2.11",
"half",
"hashbrown 0.14.5",
"hex",
"hmac",
"hyper 0.14.26",
"hyper 0.14.30",
"indexmap 1.9.3",
"itertools 0.10.5",
"itertools 0.12.1",
@@ -7504,7 +7345,7 @@ dependencies = [
"der 0.7.8",
"hex",
"pem",
"ring 0.17.6",
"ring",
"signature 2.2.0",
"spki 0.7.3",
"thiserror",

View File

@@ -76,8 +76,6 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "7.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
enum-map = "2.4.2"
@@ -95,7 +93,7 @@ hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
@@ -104,7 +102,6 @@ hyper = "0.14"
tokio-tungstenite = "0.20.0"
indexmap = "2"
indoc = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
@@ -113,7 +110,7 @@ libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
memoffset = "0.9"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
@@ -142,7 +139,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.22"
rustls-pemfile = "2"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -164,7 +160,6 @@ strum_macros = "0.26"
svg_fmt = "0.4.3"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
thiserror = "1.0"
tikv-jemallocator = "0.5"

View File

@@ -52,11 +52,6 @@ Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your di
```
# recommended approach from https://www.rust-lang.org/tools/install
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# bash-only: add $HOME/.cargo/bin to PATH. The script does this for zsh
# echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> ~/.bashrc
rustup show
```
#### Installing dependencies on macOS (12.3.1)
@@ -79,11 +74,6 @@ brew link --force m4
```
# recommended approach from https://www.rust-lang.org/tools/install
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# bash-only: add $HOME/.cargo/bin to PATH. The script does this for zsh
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> ~/.bashrc
rustup show
```
3. Install PostgreSQL Client

View File

@@ -82,7 +82,7 @@ impl ApiError {
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
StatusCode::INTERNAL_SERVER_ERROR,
),
}

View File

@@ -21,7 +21,13 @@
//!
//! Another explanation can be found here: <https://brandur.org/rate-limiting>
use std::{sync::Mutex, time::Duration};
use std::{
sync::{
atomic::{AtomicU64, Ordering},
Mutex,
},
time::Duration,
};
use tokio::{sync::Notify, time::Instant};
@@ -128,6 +134,7 @@ impl LeakyBucketState {
pub struct RateLimiter {
pub config: LeakyBucketConfig,
pub sleep_counter: AtomicU64,
pub state: Mutex<LeakyBucketState>,
/// a queue to provide this fair ordering.
pub queue: Notify,
@@ -144,6 +151,7 @@ impl Drop for Requeue<'_> {
impl RateLimiter {
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
RateLimiter {
sleep_counter: AtomicU64::new(0),
state: Mutex::new(LeakyBucketState::with_initial_tokens(
&config,
initial_tokens,
@@ -163,15 +171,16 @@ impl RateLimiter {
/// returns true if we did throttle
pub async fn acquire(&self, count: usize) -> bool {
let mut throttled = false;
let start = tokio::time::Instant::now();
let start_count = self.sleep_counter.load(Ordering::Acquire);
let mut end_count = start_count;
// wait until we are the first in the queue
let mut notified = std::pin::pin!(self.queue.notified());
if !notified.as_mut().enable() {
throttled = true;
notified.await;
end_count = self.sleep_counter.load(Ordering::Acquire);
}
// notify the next waiter in the queue when we are done.
@@ -184,9 +193,22 @@ impl RateLimiter {
.unwrap()
.add_tokens(&self.config, start, count as f64);
match res {
Ok(()) => return throttled,
Ok(()) => return end_count > start_count,
Err(ready_at) => {
throttled = true;
struct Increment<'a>(&'a AtomicU64);
impl Drop for Increment<'_> {
fn drop(&mut self) {
self.0.fetch_add(1, Ordering::AcqRel);
}
}
// increment the counter after we finish sleeping (or cancel this task).
// this ensures that tasks that have already started the acquire will observe
// the new sleep count when they are allowed to resume on the notify.
let _inc = Increment(&self.sleep_counter);
end_count += 1;
tokio::time::sleep_until(ready_at).await;
}
}

View File

@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
}
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
global_metric: &'a Histogram,
global_latency_histo: &'a Histogram,
// Optional because not all op types are tracked per-timeline
timeline_metric: Option<&'a Histogram>,
per_timeline_latency_histo: Option<&'a Histogram>,
ctx: &'c RequestContext,
start: std::time::Instant,
@@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
elapsed
}
};
self.global_metric.observe(ex_throttled.as_secs_f64());
if let Some(timeline_metric) = self.timeline_metric {
timeline_metric.observe(ex_throttled.as_secs_f64());
self.global_latency_histo
.observe(ex_throttled.as_secs_f64());
if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
}
}
}
@@ -1240,10 +1241,32 @@ pub enum SmgrQueryType {
#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
global_metrics: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage: Histogram,
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage_started: IntCounter,
per_timeline_getpage_latency: Histogram,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but the name is prepared so it can be extended to a histogram of queue depth
"pageserver_smgr_query_started_global_count",
"Number of smgr queries started, aggregated by query type.",
&["smgr_query_type"],
)
.expect("failed to define a metric")
});
static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but the name is prepared so it can be extended to a histogram of queue depth
"pageserver_smgr_query_started_count",
"Number of smgr queries started, aggregated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
@@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let global_metrics = std::array::from_fn(|i| {
let global_started = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_STARTED_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let global_latency = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_TIME_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
@@ -1334,9 +1363,20 @@ impl SmgrQueryTimePerTimeline {
&timeline_id,
])
.unwrap();
let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
&shard_slug,
&timeline_id,
])
.unwrap();
Self {
global_metrics,
per_timeline_getpage,
global_started,
global_latency,
per_timeline_getpage_latency,
per_timeline_getpage_started,
}
}
pub(crate) fn start_timer<'c: 'a, 'a>(
@@ -1344,8 +1384,11 @@ impl SmgrQueryTimePerTimeline {
op: SmgrQueryType,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
let global_metric = &self.global_metrics[op as usize];
let start = Instant::now();
self.global_started[op as usize].inc();
// We subtract time spent throttled from the observed latency.
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
Err(error) => {
@@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline {
}
}
let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
Some(&self.per_timeline_getpage)
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
self.per_timeline_getpage_started.inc();
Some(&self.per_timeline_getpage_latency)
} else {
None
};
Some(GlobalAndPerTimelineHistogramTimer {
global_metric,
timeline_metric,
global_latency_histo: &self.global_latency[op as usize],
per_timeline_latency_histo,
ctx,
start,
op,
@@ -1423,9 +1467,12 @@ mod smgr_query_time_tests {
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
.sum();
(global, metrics.per_timeline_getpage.get_sample_count())
(
global,
metrics.per_timeline_getpage_latency.get_sample_count(),
)
};
let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -2576,6 +2623,12 @@ impl TimelineMetrics {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
shard_id,
timeline_id,
]);
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
@@ -2592,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
tenant_throttling::remove_tenant_metrics(tenant_shard_id);
// we leave the BROKEN_TENANTS_SET entry if any
}
@@ -3055,41 +3110,180 @@ pub mod tokio_epoll_uring {
pub(crate) mod tenant_throttling {
use metrics::{register_int_counter_vec, IntCounter};
use once_cell::sync::Lazy;
use utils::shard::TenantShardId;
use crate::tenant::{self, throttle::Metric};
pub(crate) struct TimelineGet {
wait_time: IntCounter,
count: IntCounter,
struct GlobalAndPerTenantIntCounter {
global: IntCounter,
per_tenant: IntCounter,
}
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
impl GlobalAndPerTenantIntCounter {
#[inline(always)]
pub(crate) fn inc(&self) {
self.inc_by(1)
}
#[inline(always)]
pub(crate) fn inc_by(&self, n: u64) {
self.global.inc_by(n);
self.per_tenant.inc_by(n);
}
}
pub(crate) struct TimelineGet {
count_accounted_start: GlobalAndPerTenantIntCounter,
count_accounted_finish: GlobalAndPerTenantIntCounter,
wait_time: GlobalAndPerTenantIntCounter,
count_throttled: GlobalAndPerTenantIntCounter,
}
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start_global",
"Count of tenant throttling starts, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
let kind = "timeline_get";
TimelineGet {
wait_time: WAIT_USECS.with_label_values(&[kind]),
count: WAIT_COUNT.with_label_values(&[kind]),
}
.unwrap()
});
static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start",
"Count of tenant throttling starts, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish_global",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
impl Metric for &'static TimelineGet {
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count",
"Count of tenant throttlings, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
const KIND: &str = "timeline_get";
impl TimelineGet {
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
TimelineGet {
count_accounted_start: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
count_accounted_finish: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
wait_time: {
GlobalAndPerTenantIntCounter {
global: WAIT_USECS.with_label_values(&[KIND]),
per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
count_throttled: {
GlobalAndPerTenantIntCounter {
global: WAIT_COUNT.with_label_values(&[KIND]),
per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
}
}
}
pub(crate) fn preinitialize_global_metrics() {
Lazy::force(&COUNT_ACCOUNTED_START);
Lazy::force(&COUNT_ACCOUNTED_FINISH);
Lazy::force(&WAIT_USECS);
Lazy::force(&WAIT_COUNT);
}
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
for m in &[
&COUNT_ACCOUNTED_START_PER_TENANT,
&COUNT_ACCOUNTED_FINISH_PER_TENANT,
&WAIT_USECS_PER_TENANT,
&WAIT_COUNT_PER_TENANT,
] {
let _ = m.remove_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]);
}
}
impl Metric for TimelineGet {
#[inline(always)]
fn accounting_start(&self) {
self.count_accounted_start.inc();
}
#[inline(always)]
fn accounting_finish(&self) {
self.count_accounted_finish.inc();
}
#[inline(always)]
fn observe_throttling(
&self,
@@ -3097,7 +3291,7 @@ pub(crate) mod tenant_throttling {
) {
let val = u64::try_from(wait_time.as_micros()).unwrap();
self.wait_time.inc_by(val);
self.count.inc();
self.count_throttled.inc();
}
}
}
@@ -3227,11 +3421,14 @@ pub fn preinitialize_metrics() {
}
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
[
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
&SMGR_QUERY_STARTED_GLOBAL,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
@@ -3253,7 +3450,8 @@ pub fn preinitialize_metrics() {
// Custom
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
Lazy::force(&BASEBACKUP_QUERY_TIME);
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
tenant_throttling::preinitialize_global_metrics();
}

View File

@@ -18,7 +18,6 @@ use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
@@ -34,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
use remote_storage::TimeoutOrCancel;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
@@ -302,7 +302,7 @@ pub struct Tenant {
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach concurrency limiter.
///
@@ -1031,13 +1031,9 @@ impl Tenant {
}
Ok(TenantPreload {
timelines: Self::load_timeline_metadata(
self,
remote_timeline_ids,
remote_storage,
cancel,
)
.await?,
timelines: self
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.await?,
})
}
@@ -1303,7 +1299,7 @@ impl Tenant {
.await
}
async fn load_timeline_metadata(
async fn load_timelines_metadata(
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
@@ -1311,33 +1307,10 @@ impl Tenant {
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
let cancel_clone = cancel.clone();
part_downloads.spawn(
async move {
debug!("starting index part download");
let index_part = client.download_index_file(&cancel_clone).await;
debug!("finished index part download");
Result::<_, anyhow::Error>::Ok(TimelinePreload {
client,
timeline_id,
index_part,
})
}
.map(move |res| {
res.with_context(|| format!("download index part for timeline {timeline_id}"))
})
.instrument(info_span!("download_index_part", %timeline_id)),
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
@@ -1348,8 +1321,7 @@ impl Tenant {
next = part_downloads.join_next() => {
match next {
Some(result) => {
let preload_result = result.context("join preload task")?;
let preload = preload_result?;
let preload = result.context("join preload task")?;
timeline_preloads.insert(preload.timeline_id, preload);
},
None => {
@@ -1366,6 +1338,36 @@ impl Tenant {
Ok(timeline_preloads)
}
fn load_timeline_metadata(
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("starting index part download");
let index_part = client.download_index_file(&cancel).await;
debug!("finished index part download");
TimelinePreload {
client,
timeline_id,
index_part,
}
}
}
pub(crate) async fn apply_timeline_archival_config(
&self,
timeline_id: TimelineId,
@@ -2831,7 +2833,7 @@ impl Tenant {
gate: Gate::default(),
timeline_get_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
&crate::metrics::tenant_throttling::TIMELINE_GET,
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),

View File

@@ -439,11 +439,30 @@ impl Layer {
fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
// Visibility was modified to Visible: maybe log about this
match ctx.task_kind() {
TaskKind::CalculateSyntheticSize
| TaskKind::GarbageCollector
| TaskKind::MgmtRequest => {
// This situation is expected in code paths that do binary searches of the LSN space to resolve
// an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
// and on-demand for certain HTTP API requests.
}
_ => {
// In all other contexts, it is unusual to do I/O involving layers which are not visible at
// some branch tip, so we log the fact that we are accessing something that the visibility
// calculation thought should not be visible.
//
// This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
// which was covered by a concurrent compaction.
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
}
}
// Update the timeline's visible bytes count
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge

View File

@@ -1025,6 +1025,15 @@ fn access_stats() {
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
// Recording access implicitly makes layer visible, if it wasn't already
let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
access_stats.set_visibility(LayerVisibilityHint::Covered);
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
assert!(access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert!(!access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
}
#[test]

View File

@@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// How many errors we have seen consecutively
let mut error_run_count = 0;
let mut last_throttle_flag_reset_at = Instant::now();
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
let sleep_duration;
if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
@@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
};
// Run compaction
let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
let IterationResult { output, elapsed } = iteration
.run(tenant.compaction_iteration(&cancel, &ctx))
.await;
match output {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if has_pending_task { Duration::ZERO } else { period };
sleep_duration = if has_pending_task {
Duration::ZERO
} else {
period
};
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
tracing::debug!(
elapsed_ms = elapsed.as_millis(),
"compaction iteration complete"
);
};
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
if let Some(walredo_mgr) = &tenant.walredo_mgr {
walredo_mgr.maybe_quiesce(period * 10);
}
// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted,
count_throttled,
sum_throttled_usecs,
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
@@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
tokio::select! {
_ = cancel.cancelled() => {
@@ -483,6 +468,29 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
kind: BackgroundLoopKind::IngestHouseKeeping,
};
iteration.run(tenant.ingest_housekeeping()).await;
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
// Or just spawn another background loop for this throttle, it's not like it's super costly.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,
count_accounted_start, // log after pre-existing fields to not break existing log scraping
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
}
}
.await;


@@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
pub struct Throttle<M: Metric> {
inner: ArcSwap<Inner>,
metric: M,
/// will be turned into [`Stats::count_accounted`]
count_accounted: AtomicU64,
/// will be turned into [`Stats::count_accounted_start`]
count_accounted_start: AtomicU64,
/// will be turned into [`Stats::count_accounted_finish`]
count_accounted_finish: AtomicU64,
/// will be turned into [`Stats::count_throttled`]
count_throttled: AtomicU64,
/// will be turned into [`Stats::sum_throttled_usecs`]
@@ -43,17 +45,21 @@ pub struct Observation {
pub wait_time: Duration,
}
pub trait Metric {
fn accounting_start(&self);
fn accounting_finish(&self);
fn observe_throttling(&self, observation: &Observation);
}
/// See [`Throttle::reset_stats`].
pub struct Stats {
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
pub count_accounted: u64,
// Subset of the `accounted` requests that were actually throttled.
// Note that the numbers are stored as two independent atomics, so there might be a slight drift.
/// Number of requests that started [`Throttle::throttle`] calls.
pub count_accounted_start: u64,
/// Number of requests that finished [`Throttle::throttle`] calls.
pub count_accounted_finish: u64,
/// Subset of the `accounted` requests that were actually throttled.
/// Note that the numbers are stored as two independent atomics, so there might be a slight drift.
pub count_throttled: u64,
// Sum of microseconds that throttled requests spent waiting for throttling.
/// Sum of microseconds that throttled requests spent waiting for throttling.
pub sum_throttled_usecs: u64,
}
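The start/finish split matters because the two increments bracket an await on the rate limiter: a request parked inside the limiter has been counted as started but not yet finished, so a concurrent reset_stats() can read start > finish. A minimal Python sketch of that window (hypothetical stand-ins, not the pageserver code):

import asyncio

count_accounted_start = 0
count_accounted_finish = 0

async def throttle() -> None:
    """Stand-in for Throttle::throttle(): bump one counter, await, bump the other."""
    global count_accounted_start, count_accounted_finish
    count_accounted_start += 1   # before the rate-limiter acquire
    await asyncio.sleep(0.05)    # the (possibly throttled) acquire
    count_accounted_finish += 1  # after the acquire returns

async def main() -> None:
    tasks = [asyncio.create_task(throttle()) for _ in range(10)]
    await asyncio.sleep(0)       # every task is now parked at its await point
    # A stats reader sampling here observes the drift: all started, none finished.
    print(count_accounted_start, count_accounted_finish)  # -> 10 0
    await asyncio.gather(*tasks)
    print(count_accounted_start, count_accounted_finish)  # -> 10 10

asyncio.run(main())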
@@ -65,7 +71,8 @@ where
Self {
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
metric,
count_accounted: AtomicU64::new(0),
count_accounted_start: AtomicU64::new(0),
count_accounted_finish: AtomicU64::new(0),
count_throttled: AtomicU64::new(0),
sum_throttled_usecs: AtomicU64::new(0),
}
@@ -117,11 +124,13 @@ where
/// This method allows retrieving & resetting that flag.
/// Useful for periodic reporting.
pub fn reset_stats(&self) -> Stats {
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
Stats {
count_accounted,
count_accounted_start,
count_accounted_finish,
count_throttled,
sum_throttled_usecs,
}
@@ -139,9 +148,12 @@ where
};
let start = std::time::Instant::now();
self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
self.metric.accounting_finish();
self.count_accounted.fetch_add(1, Ordering::Relaxed);
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
let now = Instant::now();


@@ -196,9 +196,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
pub timeline_get_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
@@ -406,9 +405,8 @@ pub struct Timeline {
gc_lock: tokio::sync::Mutex<()>,
/// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
timeline_get_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
/// Keep aux directory cache to avoid it's reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
@@ -4316,7 +4314,9 @@ impl Timeline {
timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
self.update_layer_visibility().await?;
if !image_layers.is_empty() {
self.update_layer_visibility().await?;
}
Ok(image_layers)
}


@@ -23,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \


@@ -1263,7 +1263,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
int32 dc;
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
LWLockAcquire(lfc_lock, LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration, 1.0);
LWLockRelease(lfc_lock);
PG_RETURN_INT32(dc);
}
@@ -1280,7 +1280,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
int32 dc;
bool reset = PG_GETARG_BOOL(0);
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1, 1.0);
if (reset)
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
LWLockRelease(lfc_lock);
@@ -1288,3 +1288,21 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
}
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(approximate_optimal_cache_size);
Datum
approximate_optimal_cache_size(PG_FUNCTION_ARGS)
{
if (lfc_size_limit != 0)
{
int32 dc;
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
double min_hit_ratio = PG_ARGISNULL(1) ? 1.0 : PG_GETARG_FLOAT8(1);
LWLockAcquire(lfc_lock, LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration, min_hit_ratio);
LWLockRelease(lfc_lock);
PG_RETURN_INT32(dc);
}
PG_RETURN_NULL();
}


@@ -6,7 +6,7 @@
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
*
* Implements https://hal.science/hal-00465313/document
*
*
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
* suited to estimating the cardinality of very large sets; in particular, we
* have not attempted to further optimize the implementation as described in
@@ -126,22 +126,69 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
cState->regs[index][count] = now;
if (cState->regs[index][count].ts)
{
/* update histogram */
int64_t delta = (now - cState->regs[index][count].ts)/USECS_PER_SEC;
uint32_t new_histogram[HIST_SIZE] = {0};
for (int i = 0; i < HIST_SIZE; i++) {
/* Use the middle point of the interval */
uint32 interval_log2 = pg_ceil_log2_32((delta + (HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2)) / HIST_MIN_INTERVAL);
uint32 cell = Min(interval_log2, HIST_SIZE-1);
new_histogram[cell] += cState->regs[index][count].histogram[i];
}
memcpy(cState->regs[index][count].histogram, new_histogram, sizeof new_histogram);
}
cState->regs[index][count].ts = now;
cState->regs[index][count].histogram[0] += 1; // the most recent access always goes to the first histogram bucket
}
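The re-bucketing above is easier to follow with concrete numbers. A Python sketch of the assumed semantics (cell i holds accesses aged roughly (2^(i-1), 2^i] minutes; on each new access, old counts are aged by delta seconds using the midpoint of their old interval):

HIST_SIZE = 8
HIST_MIN_INTERVAL = 60  # seconds

def ceil_log2(n: int) -> int:
    # mirrors pg_ceil_log2_32: 0 for n < 2, else ceil(log2(n))
    return 0 if n < 2 else (n - 1).bit_length()

def age_histogram(histogram: list[int], delta: int) -> list[int]:
    new_histogram = [0] * HIST_SIZE
    for i, count in enumerate(histogram):
        # midpoint of cell i's age interval, in seconds (same integer arithmetic as the C code)
        midpoint = HIST_MIN_INTERVAL * ((1 << i) + (1 << i) // 2) // 2
        cell = min(ceil_log2((delta + midpoint) // HIST_MIN_INTERVAL), HIST_SIZE - 1)
        new_histogram[cell] += count
    return new_histogram

# 5 accesses in the most recent minute, re-bucketed 10 minutes later:
# age ~10.5 min falls into cell 4, which covers (8, 16] minutes.
print(age_histogram([5, 0, 0, 0, 0, 0, 0, 0], delta=600))
# -> [0, 0, 0, 0, 5, 0, 0, 0]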
static uint32_t
getAccessCount(const HyperLogLogRegister* reg, time_t duration)
{
uint32_t count = 0;
/* Simplest solution is to take into account all points from the overlapped intervals */
for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
count += reg->histogram[i];
}
return count;
}
static uint8
getMaximum(const TimestampTz* reg, TimestampTz since)
getMaximum(const HyperLogLogRegister* reg, TimestampTz since, time_t duration, double min_hit_ratio)
{
uint8 max = 0;
for (size_t i = 0; i < HLL_C_BITS + 1; i++)
size_t i;
if (min_hit_ratio == 1.0)
{
if (reg[i] >= since)
for (i = 0; i < HLL_C_BITS + 1; i++)
{
max = i;
if (reg[i].ts >= since)
{
max = i;
}
}
}
else
{
uint32_t total_count = 0;
for (i = 0; i < HLL_C_BITS + 1; i++)
{
total_count += getAccessCount(&reg[i], duration);
}
if (total_count != 0)
{
const double threshold = total_count * (1 - min_hit_ratio);
for (i = 0; i < HLL_C_BITS + 1; i++)
{
// Take into account only bits whose access frequency exceeds the maximal miss rate (1 - hit rate)
if (reg[i].ts >= since && getAccessCount(&reg[i], duration) >= threshold)
{
max = i;
}
}
}
}
return max;
}
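In other words: with min_hit_ratio = 1.0 every recently-touched rank contributes, while a lower ratio drops ranks whose share of accesses stays below total * (1 - min_hit_ratio), treating them as acceptable misses, which lowers the maximum rank and hence the cardinality (and cache size) estimate. A Python sketch of the selection (the reg[i].ts >= since recency check is elided):

def get_maximum(counts: list[int], min_hit_ratio: float) -> int:
    """counts[i] = accesses recorded for rank i within the duration window."""
    total = sum(counts)
    if total == 0:
        return 0
    threshold = total * (1 - min_hit_ratio)
    return max((i for i, c in enumerate(counts) if c > 0 and c >= threshold), default=0)

# ranks 0 and 1 are hot; rank 5 saw a single access in the window
counts = [600, 399, 0, 0, 0, 1]
print(get_maximum(counts, min_hit_ratio=1.0))   # -> 5: every access counts
print(get_maximum(counts, min_hit_ratio=0.99))  # -> 1: rank 5 is below the threshold of 10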
@@ -150,7 +197,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since)
* Estimates cardinality, based on elements added so far
*/
double
estimateSHLL(HyperLogLogState *cState, time_t duration)
estimateSHLL(HyperLogLogState *cState, time_t duration, double min_hit_ratio)
{
double result;
double sum = 0.0;
@@ -161,7 +208,7 @@ estimateSHLL(HyperLogLogState *cState, time_t duration)
for (i = 0; i < HLL_N_REGISTERS; i++)
{
R[i] = getMaximum(cState->regs[i], since);
R[i] = getMaximum(cState->regs[i], since, duration, min_hit_ratio);
sum += 1.0 / pow(2.0, R[i]);
}


@@ -53,6 +53,14 @@
#define HLL_C_BITS (32 - HLL_BIT_WIDTH)
#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)
/*
* Number of histogram cells. We use an exponential histogram whose first
* interval is one minute. The autoscaler requests LFC statistics at
* intervals of 1,2,...,60 minutes, so 8 cells (covering ages up to
* 2^7 = 128 minutes) are enough for our needs.
*/
#define HIST_SIZE 8
#define HIST_MIN_INTERVAL 60 /* seconds */
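Since each cell covers twice the interval of its predecessor, the eight cells' upper age bounds, in minutes, work out to (illustration only):

print([2 ** i for i in range(8)])  # -> [1, 2, 4, 8, 16, 32, 64, 128]; the last cell also absorbs older accesses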
/*
* HyperLogLog is an approximate technique for computing the number of distinct
* entries in a set. Importantly, it does this by using a fixed amount of
@@ -69,18 +77,21 @@
* modified timestamp >= the query timestamp. This value is the number of bits
* for this register in the normal HLL calculation.
*
* The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184kiB.
* Usage could be halved if we decide to reduce the required time dimension
* precision; as 32 bits in second precision should be enough for statistics.
* However, that is not yet implemented.
* The memory usage is 2^B * (C + 1) * sizeof(HyperLogLogRegister), or 920kiB.
*/
typedef struct
{
TimestampTz ts; /* last access timestamp */
uint32_t histogram[HIST_SIZE]; /* access counter exponential histogram */
} HyperLogLogRegister;
typedef struct HyperLogLogState
{
TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
HyperLogLogRegister regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
} HyperLogLogState;
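As a cross-check of the 920kiB figure, assuming HLL_BIT_WIDTH (B) = 10 and an 8-byte TimestampTz (neither is shown in this hunk):

B, HIST_SIZE = 10, 8                          # assumed HLL_BIT_WIDTH; HIST_SIZE from above
C = 32 - B                                    # HLL_C_BITS
reg_size = 8 + 4 * HIST_SIZE                  # sizeof(HyperLogLogRegister): ts + histogram = 40 bytes
print((1 << B) * (C + 1) * reg_size // 1024)  # -> 920 (kiB); previously 1024 * 23 * 8 = 184 kiB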
extern void initSHLL(HyperLogLogState *cState);
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
extern double estimateSHLL(HyperLogLogState *cState, time_t duration);
extern double estimateSHLL(HyperLogLogState *cState, time_t duration, double min_hit_ratio);
#endif


@@ -0,0 +1,10 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
-- returns the minimal LFC cache size (in 8kB pages) that provides the specified hit rate
CREATE FUNCTION approximate_optimal_cache_size(duration_sec integer default null, min_hit_ration float8 default null)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_optimal_cache_size'
LANGUAGE C PARALLEL SAFE;
GRANT EXECUTE ON FUNCTION approximate_optimal_cache_size(integer,float8) TO pg_monitor;
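A usage sketch (the connection string and psycopg2 are placeholders, not part of this patch): the smallest LFC size, in 8kB pages, that would keep the hit rate at or above 99% over the last hour.

import psycopg2

with psycopg2.connect("dbname=postgres") as conn:
    with conn.cursor() as cur:
        cur.execute("select approximate_optimal_cache_size(3600, 0.99)")
        pages = cur.fetchone()[0]
        print(f"~{pages * 8 // 1024} MiB of LFC needed for a 99% hit rate")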


@@ -0,0 +1 @@
DROP FUNCTION IF EXISTS approximate_optimal_cache_size(integer,float8) CASCADE;


@@ -3,6 +3,7 @@ use std::{
borrow::Cow,
cmp::Ordering,
collections::{BTreeMap, HashMap, HashSet},
error::Error,
ops::Deref,
path::PathBuf,
str::FromStr,
@@ -218,9 +219,16 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
format!("{node} error receiving error body: {str}").into(),
)
}
mgmt_api::Error::ReceiveBody(str) => {
// Presume errors receiving body are connectivity/availability issues
ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
mgmt_api::Error::ReceiveBody(err) if err.is_decode() => {
// Return 500 for decoding errors.
ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body"))
}
mgmt_api::Error::ReceiveBody(err) => {
// Presume errors receiving body are connectivity/availability issues, except for decoding errors
let src_str = err.source().map(|e| e.to_string()).unwrap_or_default();
ApiError::ResourceUnavailable(
format!("{node} error receiving error body: {err} {}", src_str).into(),
)
}
mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())


@@ -102,6 +102,11 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
def counter(name: str) -> str:
# the prometheus_client package appends _total to all counters client-side
return f"{name}_total"
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_timeline_client_calls_started_total",
"pageserver_remote_timeline_client_calls_finished_total",
@@ -132,9 +137,14 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),
*histogram("pageserver_io_operations_seconds"),
"pageserver_smgr_query_started_global_count_total",
"pageserver_tenant_states_count",
"pageserver_circuit_breaker_broken_total",
"pageserver_circuit_breaker_unbroken_total",
counter("pageserver_tenant_throttling_count_accounted_start_global"),
counter("pageserver_tenant_throttling_count_accounted_finish_global"),
counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
counter("pageserver_tenant_throttling_count_global"),
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
@@ -146,6 +156,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",
"pageserver_smgr_query_started_count_total",
"pageserver_archive_size",
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
@@ -157,6 +168,10 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
"pageserver_valid_lsn_lease_count",
counter("pageserver_tenant_throttling_count_accounted_start"),
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),
counter("pageserver_tenant_throttling_count"),
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken


@@ -114,3 +114,46 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
assert estimation_1k >= 20 and estimation_1k <= 40
assert estimation_10k >= 200 and estimation_10k <= 400
def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=256MB",
"neon.file_cache_size_limit=245MB",
],
)
conn = endpoint.connect()
cur = conn.cursor()
cur.execute("create extension neon version '1.5'")
cur.execute(
"create table t_huge(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
)
cur.execute(
"create table t_small(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
)
cur.execute(
"insert into t_huge(pk) values (generate_series(1,1000000))"
) # table size is 21277 pages
cur.execute(
"insert into t_small(pk) values (generate_series(1,100000))"
) # table size is 2128 pages
time.sleep(2)
before = time.monotonic()
for _ in range(100):
cur.execute("select sum(count) from t_small")
cur.execute("select sum(count) from t_huge")
after = time.monotonic()
cur.execute(f"select approximate_working_set_size_seconds({int(after - before + 1)})")
ws_estimation = cur.fetchall()[0][0]
log.info(f"Working set size estimaton {ws_estimation}")
cur.execute(f"select approximate_optimal_cache_size({int(after - before + 1)}, 0.99)")
optimal_cache_size = cur.fetchall()[0][0]
log.info(f"Optimal cache size for 99% hit rate {optimal_cache_size}")
assert ws_estimation >= 20000 and ws_estimation <= 30000
assert optimal_cache_size >= 2000 and optimal_cache_size <= 3000


@@ -8,6 +8,7 @@ from fixtures.neon_fixtures import (
PgBin,
fork_at_current_lsn,
import_timeline_from_vanilla_postgres,
wait_for_wal_insert_lsn,
)
@@ -22,11 +23,6 @@ def twophase_test_on_timeline(env: NeonEnv):
conn = endpoint.connect()
cur = conn.cursor()
# FIXME: Switch to the next WAL segment, to work around the bug fixed in
# https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be
# removed.
cur.execute("select pg_switch_wal()")
cur.execute("CREATE TABLE foo (t text)")
# Prepare a transaction that will insert a row
@@ -140,3 +136,28 @@ def test_twophase_nonzero_epoch(
vanilla_pg.stop() # don't need the original server anymore
twophase_test_on_timeline(env)
def test_twophase_at_wal_segment_start(neon_simple_env: NeonEnv):
"""
Same as 'test_twophase' test, but the server is started at an LSN at the beginning
of a WAL segment. We had a bug where we didn't initialize the "long XLOG page header"
at the beginning of the segment correctly, which was detected when the checkpointer
tried to read the XLOG_XACT_PREPARE record from the WAL, if that record was on the
very first page of a WAL segment and the server was started up at that first page.
"""
env = neon_simple_env
timeline_id = env.neon_cli.create_branch("test_twophase", "main")
endpoint = env.endpoints.create_start(
"test_twophase", config_lines=["max_prepared_transactions=5"]
)
endpoint.safe_psql("SELECT pg_switch_wal()")
# to avoid hitting https://github.com/neondatabase/neon/issues/9079, wait for the
# WAL to reach the pageserver.
wait_for_wal_insert_lsn(env, endpoint, env.initial_tenant, timeline_id)
endpoint.stop_and_destroy()
twophase_test_on_timeline(env)


@@ -1,18 +1,18 @@
{
"v17": [
"17rc1",
"5bbb9bd93dd805e90bd8af15d00080363d18ec68"
"2cf120e7393ca5f537c6a38b457585576dc035fc"
],
"v16": [
"16.4",
"3ec6e2496f64c6fec35c67cb82efd6490a6a4738"
"1d7081a3b076ddf5086e0b118d4329820e6a7427"
],
"v15": [
"15.8",
"72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1"
"16c3c6b64f1420a367a2a9b2510f20d94f791af8"
],
"v14": [
"14.13",
"87cb68f899db434cd6f1908cf0ac8fdeafdd88c1"
"a38d15f3233a4c07f2bf3335fcbd874dd1f4e386"
]
}


@@ -45,6 +45,7 @@ futures-io = { version = "0.3" }
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
half = { version = "2", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.14", features = ["raw"] }
hex = { version = "0.4", features = ["serde"] }
hmac = { version = "0.12", default-features = false, features = ["reset"] }
@@ -106,6 +107,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
either = { version = "1" }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
half = { version = "2", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.14", features = ["raw"] }
indexmap = { version = "1", default-features = false, features = ["std"] }
itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }