From b6d547cf92394cb3f8f73b23a769a3f4c241eec3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 8 May 2024 13:22:27 +0200
Subject: [PATCH 001/130] Tiered compaction: add order asserts after delta key
 k-merge (#7648)

Adds ordering asserts to the output of the delta key iterator
`MergeDeltaKeys` that implements a k-merge.

Part of #7296 : the asserts added by this PR get hit in the reproducers
of #7296 as well, but they are earlier in the pipeline.
---
 pageserver/compaction/src/helpers.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index 1b80373ba7..eb0e5ee82a 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -9,6 +9,7 @@ use pageserver_api::shard::ShardIdentity;
 use pin_project_lite::pin_project;
 use std::collections::BinaryHeap;
 use std::collections::VecDeque;
+use std::fmt::Display;
 use std::future::Future;
 use std::ops::{DerefMut, Range};
 use std::pin::Pin;
@@ -214,7 +215,7 @@ pub struct KeySize<K> {
 
 pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
 where
-    K: Eq,
+    K: Eq + PartialOrd + Display + Copy,
     I: Stream<Item = Result<D, E>>,
     D: CompactionDeltaEntry<'a, K>,
 {
@@ -229,12 +230,15 @@ where
                 num_values: 1,
                 size: first.size(),
             };
+            let mut last_key = accum.key;
             while let Some(this) = input.next().await {
                 let this = this?;
                 if this.key() == accum.key {
                     accum.size += this.size();
                     accum.num_values += 1;
                 } else {
+                    assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
+                    last_key = accum.key;
                     yield accum;
                     accum = KeySize {
                         key: this.key(),
@@ -243,6 +247,7 @@ where
                     };
                 }
             }
+            assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
             yield accum;
         }
     }

From 870786bd8214480d8ce4aa56706cd8606b07ef15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 8 May 2024 13:22:55 +0200
Subject: [PATCH 002/130] Improve tiered compaction tests (#7643)

Improves the tiered compaction tests:

* Adds a new test that is a simpler version of the ignored
`test_many_updates_for_single_key` test.
* Reduces the amount of data that `test_many_updates_for_single_key`
processes to make it execute more quickly.
* Adds logging support.
---
 pageserver/compaction/tests/tests.rs | 45 ++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs
index 1cea2a20e1..7aa20e6863 100644
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -1,5 +1,20 @@
+use once_cell::sync::OnceCell;
 use pageserver_compaction::interface::CompactionLayer;
 use pageserver_compaction::simulator::MockTimeline;
+use utils::logging;
+
+static LOG_HANDLE: OnceCell<()> = OnceCell::new();
+
+pub(crate) fn setup_logging() {
+    LOG_HANDLE.get_or_init(|| {
+        logging::init(
+            logging::LogFormat::Test,
+            logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+            logging::Output::Stdout,
+        )
+        .expect("Failed to init test logging")
+    });
+}
 
 /// Test the extreme case that there are so many updates for a single key that
 /// even if we produce an extremely narrow delta layer, spanning just that one
@@ -11,13 +26,14 @@ use pageserver_compaction::simulator::MockTimeline;
 #[ignore]
 #[tokio::test]
 async fn test_many_updates_for_single_key() {
+    setup_logging();
     let mut executor = MockTimeline::new();
-    executor.target_file_size = 10_000_000; // 10 MB
+    executor.target_file_size = 1_000_000; // 1 MB
 
-    // Ingest 100 MB of updates to a single key.
+    // Ingest 10 MB of updates to a single key.
     for _ in 1..1000 {
         executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
-        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
+        executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
         executor.compact().await.unwrap();
     }
 
@@ -33,3 +49,26 @@ async fn test_many_updates_for_single_key() {
         }
     }
 }
+
+#[tokio::test]
+async fn test_simple_updates() {
+    setup_logging();
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 500_000; // 500 KB
+
+    // Ingest some traffic.
+    for _ in 1..400 {
+        executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
+    }
+
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+
+    println!("Running compaction...");
+    executor.compact().await.unwrap();
+
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+}

From a4a4d78993781e7aa723c1df6b833435c2fb2e8c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 8 May 2024 12:26:56 +0100
Subject: [PATCH 003/130] build(deps): bump moto from 4.1.2 to 5.0.6 (#7653)

## Problem

The main point of this PR is to get rid of `python-jose` and `ecdsa`
packages as transitive dependencies through `moto`.
They have a bunch of open vulnerabilities[1][2][3] (which don't affect
us directly), but it's nice not to have them at all.

- [1] https://github.com/advisories/GHSA-wj6h-64fc-37mp
- [2] https://github.com/advisories/GHSA-6c5p-j8vq-pqhj
- [3] https://github.com/advisories/GHSA-cjwg-qfpm-7377

## Summary of changes
- Update `moto` from 4.1.2 to 5.0.6
- Update code to accommodate breaking changes in `moto_server`
---
 poetry.lock                            | 585 +++++++++++++++++++------
 pyproject.toml                         |   2 +-
 test_runner/fixtures/remote_storage.py |   2 +-
 3 files changed, 443 insertions(+), 146 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index e437f5de74..ef9f572b17 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -158,6 +158,28 @@ files = [
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"
 
+[[package]]
+name = "annotated-types"
+version = "0.6.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"},
+    {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"},
+]
+
+[[package]]
+name = "antlr4-python3-runtime"
+version = "4.13.1"
+description = "ANTLR 4.13.1 runtime for Python 3"
+optional = false
+python-versions = "*"
+files = [
+    {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"},
+    {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"},
+]
+
 [[package]]
 name = "anyio"
 version = "4.3.0"
@@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy"
 
 [[package]]
 name = "aws-sam-translator"
-version = "1.48.0"
+version = "1.88.0"
 description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates"
 optional = false
-python-versions = ">=3.7, <=4.0, !=4.0"
+python-versions = "!=4.0,<=4.0,>=3.8"
 files = [
-    {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"},
-    {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"},
-    {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"},
+    {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"},
+    {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"},
 ]
 
 [package.dependencies]
 boto3 = ">=1.19.5,<2.dev0"
-jsonschema = ">=3.2,<4.0"
+jsonschema = ">=3.2,<5"
+pydantic = ">=1.8,<3"
+typing-extensions = ">=4.4"
 
 [package.extras]
-dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"]
+dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"]
 
 [[package]]
 name = "aws-xray-sdk"
@@ -798,24 +821,26 @@ pycparser = "*"
 
 [[package]]
 name = "cfn-lint"
-version = "0.61.3"
+version = "0.87.1"
 description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved"
 optional = false
-python-versions = ">=3.6, <=4.0, !=4.0"
+python-versions = "!=4.0,<=4.0,>=3.8"
 files = [
-    {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"},
-    {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"},
+    {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"},
+    {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"},
 ]
 
 [package.dependencies]
-aws-sam-translator = ">=1.47.0"
+aws-sam-translator = ">=1.87.0"
 jschema-to-python = ">=1.2.3,<1.3.0"
 jsonpatch = "*"
-jsonschema = ">=3.0,<4.0"
+jsonschema = ">=3.0,<5"
 junit-xml = ">=1.9,<2.0"
-networkx = ">=2.4,<3.0"
+networkx = ">=2.4,<4"
 pyyaml = ">5.4"
+regex = ">=2021.7.1"
 sarif-om = ">=1.0.4,<1.1.0"
+sympy = ">=1.0.0"
 
 [[package]]
 name = "charset-normalizer"
@@ -931,24 +956,6 @@ websocket-client = ">=0.32.0"
 ssh = ["paramiko (>=2.4.2)"]
 tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"]
 
-[[package]]
-name = "ecdsa"
-version = "0.18.0"
-description = "ECDSA cryptographic signature library (pure python)"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
-files = [
-    {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"},
-    {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"},
-]
-
-[package.dependencies]
-six = ">=1.9.0"
-
-[package.extras]
-gmpy = ["gmpy"]
-gmpy2 = ["gmpy2"]
-
 [[package]]
 name = "exceptiongroup"
 version = "1.1.1"
@@ -1268,6 +1275,23 @@ files = [
     {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"},
 ]
 
+[[package]]
+name = "joserfc"
+version = "0.9.0"
+description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"},
+    {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"},
+]
+
+[package.dependencies]
+cryptography = "*"
+
+[package.extras]
+drafts = ["pycryptodome"]
+
 [[package]]
 name = "jschema-to-python"
 version = "1.2.3"
@@ -1309,6 +1333,20 @@ files = [
 [package.dependencies]
 jsonpointer = ">=1.9"
 
+[[package]]
+name = "jsonpath-ng"
+version = "1.6.1"
+description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming."
+optional = false
+python-versions = "*"
+files = [
+    {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"},
+    {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"},
+]
+
+[package.dependencies]
+ply = "*"
+
 [[package]]
 name = "jsonpickle"
 version = "2.2.0"
@@ -1338,24 +1376,39 @@ files = [
 
 [[package]]
 name = "jsonschema"
-version = "3.2.0"
+version = "4.17.3"
 description = "An implementation of JSON Schema validation for Python"
 optional = false
-python-versions = "*"
+python-versions = ">=3.7"
 files = [
-    {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"},
-    {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"},
+    {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"},
+    {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"},
 ]
 
 [package.dependencies]
 attrs = ">=17.4.0"
-pyrsistent = ">=0.14.0"
-setuptools = "*"
-six = ">=1.11.0"
+pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2"
 
 [package.extras]
-format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
-format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
+format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"]
+format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"]
+
+[[package]]
+name = "jsonschema-spec"
+version = "0.1.6"
+description = "JSONSchema Spec with object-oriented paths"
+optional = false
+python-versions = ">=3.7.0,<4.0.0"
+files = [
+    {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"},
+    {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"},
+]
+
+[package.dependencies]
+jsonschema = ">=4.0.0,<4.18.0"
+pathable = ">=0.4.1,<0.5.0"
+PyYAML = ">=5.1"
+requests = ">=2.31.0,<3.0.0"
 
 [[package]]
 name = "junit-xml"
@@ -1371,6 +1424,52 @@ files = [
 [package.dependencies]
 six = "*"
 
+[[package]]
+name = "lazy-object-proxy"
+version = "1.10.0"
+description = "A fast and thorough lazy object proxy."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"},
+    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
+]
+
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -1422,64 +1521,80 @@ files = [
 
 [[package]]
 name = "moto"
-version = "4.1.2"
+version = "5.0.6"
 description = ""
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
-    {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
+    {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"},
+    {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"},
 ]
 
 [package.dependencies]
+antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""}
 aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
 boto3 = ">=1.9.201"
-botocore = ">=1.12.201"
+botocore = ">=1.14.0"
 cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
 cryptography = ">=3.3.1"
-docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
-ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
+docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""}
 flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
 flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
 graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
 Jinja2 = ">=2.10.1"
+joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""}
 jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
-openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
+jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""}
+openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""}
+py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""}
 pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
 python-dateutil = ">=2.1,<3.0.0"
-python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
 PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
 requests = ">=2.5"
-responses = ">=0.13.0"
+responses = ">=0.15.0"
 setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
-sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
 werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
 xmltodict = "*"
 
 [package.extras]
-all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
-apigatewayv2 = ["PyYAML (>=5.1)"]
+all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
+apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"]
+apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"]
 appsync = ["graphql-core"]
-awslambda = ["docker (>=2.5.1)"]
-batch = ["docker (>=2.5.1)"]
-cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
-ds = ["sshpubkeys (>=3.1.0)"]
-dynamodb = ["docker (>=2.5.1)"]
-dynamodbstreams = ["docker (>=2.5.1)"]
-ebs = ["sshpubkeys (>=3.1.0)"]
-ec2 = ["sshpubkeys (>=3.1.0)"]
-efs = ["sshpubkeys (>=3.1.0)"]
-eks = ["sshpubkeys (>=3.1.0)"]
+awslambda = ["docker (>=3.0.0)"]
+batch = ["docker (>=3.0.0)"]
+cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
+cognitoidp = ["joserfc (>=0.9.0)"]
+dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"]
+dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"]
 glue = ["pyparsing (>=3.0.7)"]
 iotdata = ["jsondiff (>=1.1.2)"]
-route53resolver = ["sshpubkeys (>=3.1.0)"]
-s3 = ["PyYAML (>=5.1)"]
-server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
+resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"]
+s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"]
+s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"]
+server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
 ssm = ["PyYAML (>=5.1)"]
+stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"]
 xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
 
+[[package]]
+name = "mpmath"
+version = "1.3.0"
+description = "Python library for arbitrary-precision floating-point arithmetic"
+optional = false
+python-versions = "*"
+files = [
+    {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"},
+    {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"},
+]
+
+[package.extras]
+develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"]
+docs = ["sphinx"]
+gmpy = ["gmpy2 (>=2.1.0a4)"]
+tests = ["pytest (>=4.6)"]
+
 [[package]]
 name = "multidict"
 version = "6.0.4"
@@ -1654,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"]
 
 [[package]]
 name = "openapi-schema-validator"
-version = "0.2.3"
+version = "0.4.4"
 description = "OpenAPI schema validation for Python"
 optional = false
 python-versions = ">=3.7.0,<4.0.0"
 files = [
-    {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"},
-    {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"},
+    {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"},
+    {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"},
 ]
 
 [package.dependencies]
-jsonschema = ">=3.0.0,<5.0.0"
+jsonschema = ">=4.0.0,<4.18.0"
+rfc3339-validator = "*"
 
 [package.extras]
-isodate = ["isodate"]
-rfc3339-validator = ["rfc3339-validator"]
-strict-rfc3339 = ["strict-rfc3339"]
+docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"]
 
 [[package]]
 name = "openapi-spec-validator"
-version = "0.4.0"
-description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator"
+version = "0.5.7"
+description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator"
 optional = false
 python-versions = ">=3.7.0,<4.0.0"
 files = [
-    {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"},
-    {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"},
+    {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"},
+    {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"},
 ]
 
 [package.dependencies]
-jsonschema = ">=3.2.0,<5.0.0"
-openapi-schema-validator = ">=0.2.0,<0.3.0"
-PyYAML = ">=5.1"
-setuptools = "*"
-
-[package.extras]
-requests = ["requests"]
+jsonschema = ">=4.0.0,<4.18.0"
+jsonschema-spec = ">=0.1.1,<0.2.0"
+lazy-object-proxy = ">=1.7.1,<2.0.0"
+openapi-schema-validator = ">=0.4.2,<0.5.0"
 
 [[package]]
 name = "packaging"
@@ -1702,6 +1813,17 @@ files = [
     {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]
 
+[[package]]
+name = "pathable"
+version = "0.4.3"
+description = "Object-oriented paths"
+optional = false
+python-versions = ">=3.7.0,<4.0.0"
+files = [
+    {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"},
+    {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"},
+]
+
 [[package]]
 name = "pbr"
 version = "5.9.0"
@@ -1728,6 +1850,17 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "ply"
+version = "3.11"
+description = "Python Lex & Yacc"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"},
+    {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"},
+]
+
 [[package]]
 name = "prometheus-client"
 version = "0.14.1"
@@ -1840,16 +1973,19 @@ files = [
 ]
 
 [[package]]
-name = "pyasn1"
-version = "0.4.8"
-description = "ASN.1 types and codecs"
+name = "py-partiql-parser"
+version = "0.5.4"
+description = "Pure Python PartiQL Parser"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
-    {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
+    {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"},
+    {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"},
 ]
 
+[package.extras]
+dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"]
+
 [[package]]
 name = "pycparser"
 version = "2.21"
@@ -1861,6 +1997,116 @@ files = [
     {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
 ]
 
+[[package]]
+name = "pydantic"
+version = "2.7.1"
+description = "Data validation using Python type hints"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"},
+    {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"},
+]
+
+[package.dependencies]
+annotated-types = ">=0.4.0"
+pydantic-core = "2.18.2"
+typing-extensions = ">=4.6.1"
+
+[package.extras]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.18.2"
+description = "Core functionality for Pydantic validation and serialization"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"},
+    {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"},
+    {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"},
+    {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"},
+    {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"},
+    {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"},
+    {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"},
+    {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"},
+    {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"},
+    {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"},
+    {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"},
+    {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"},
+    {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"},
+    {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
+
 [[package]]
 name = "pyjwt"
 version = "2.4.0"
@@ -2115,28 +2361,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
-[[package]]
-name = "python-jose"
-version = "3.3.0"
-description = "JOSE implementation in Python"
-optional = false
-python-versions = "*"
-files = [
-    {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"},
-    {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"},
-]
-
-[package.dependencies]
-cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""}
-ecdsa = "!=0.15"
-pyasn1 = "*"
-rsa = "*"
-
-[package.extras]
-cryptography = ["cryptography (>=3.4.0)"]
-pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
-pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -2181,7 +2405,6 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2216,6 +2439,94 @@ files = [
     {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
 ]
 
+[[package]]
+name = "regex"
+version = "2024.4.28"
+description = "Alternative regular expression module, to replace re."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"},
+    {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"},
+    {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"},
+    {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"},
+    {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"},
+    {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"},
+    {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"},
+    {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"},
+    {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"},
+    {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"},
+    {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"},
+    {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"},
+]
+
 [[package]]
 name = "requests"
 version = "2.31.0"
@@ -2256,18 +2567,18 @@ urllib3 = ">=1.25.10"
 tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"]
 
 [[package]]
-name = "rsa"
-version = "4.9"
-description = "Pure-Python RSA implementation"
+name = "rfc3339-validator"
+version = "0.1.4"
+description = "A pure python RFC3339 validator"
 optional = false
-python-versions = ">=3.6,<4"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 files = [
-    {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"},
-    {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"},
+    {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"},
+    {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"},
 ]
 
 [package.dependencies]
-pyasn1 = ">=0.1.3"
+six = "*"
 
 [[package]]
 name = "ruff"
@@ -2366,22 +2677,18 @@ files = [
 ]
 
 [[package]]
-name = "sshpubkeys"
-version = "3.3.1"
-description = "SSH public key parser"
+name = "sympy"
+version = "1.12"
+description = "Computer algebra system (CAS) in Python"
 optional = false
-python-versions = ">=3"
+python-versions = ">=3.8"
 files = [
-    {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"},
-    {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"},
+    {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"},
+    {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"},
 ]
 
 [package.dependencies]
-cryptography = ">=2.1.4"
-ecdsa = ">=0.13"
-
-[package.extras]
-dev = ["twine", "wheel", "yapf"]
+mpmath = ">=0.19"
 
 [[package]]
 name = "toml"
@@ -2652,16 +2959,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2899,4 +3196,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb"
+content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
diff --git a/pyproject.toml b/pyproject.toml
index 4ec8efc2ff..ac7f9b061c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.10"
 boto3 = "^1.34.11"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
-moto = {extras = ["server"], version = "^4.1.2"}
+moto = {extras = ["server"], version = "^5.0.6"}
 backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 925e1b450f..132d2450a7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -50,7 +50,7 @@ class MockS3Server:
         # XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
         # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
         # if a process is started from the shell process.
-        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
+        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"])
         error = None
         try:
             return_code = self.subprocess.poll()

From 8728d5a5fd261a483560fcc53ece7cc51ec82600 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 8 May 2024 16:32:21 +0200
Subject: [PATCH 004/130] neon_local: use `pageserver.toml` as source of truth
 for `struct PageServerConf` (#7642)

Before this PR, `neon_local` would store a copy of a subset of the
initial `pageserver.toml` in its `.neon/config`, e.g, `listen_pg_addr`.
That copy is represented as `struct PageServerConf`.

This copy was used to inform e.g., `neon_local endpoint` and other
commands that depend on Pageserver about which port to connect to.

The problem with that scheme is that the duplicated information in
`.neon/config` can get stale if `pageserver.toml` is changed.

This PR fixes that by eliminating populating `struct PageServerConf`
from the `pageserver.toml`s.

The `[[pageservers]]` TOML table in the `.neon/config` is obsolete.
As of this PR, `neon_local` will fail to start and print an error
informing about this change.

Code-level changes:

- Remove the `--pg-version` flag, it was only used for some checks
during `neon_local init`
- Remove the warn-but-continue behavior for when auth key creation fails
but auth keys are not required. It's just complexity that is unjustified
for a tool like `neon_local`.
- Introduce a type-system-level distinction between the runtime state
and the two (!) toml formats that are almost the same but not quite.
  - runtime state: `struct PageServerConf`, now without `serde` derives
  - toml format 1: the state in `.neon/config` => `struct OnDiskState`
- toml format 2: the `neon_local init --config TMPFILE` that, unlike
`struct OnDiskState`, allows specifying `pageservers`
- Remove `[[pageservers]]` from the `struct OnDiskState` and load the
data from the individual `pageserver.toml`s instead.
---
 control_plane/src/bin/neon_local.rs        | 151 +++----
 control_plane/src/local_env.rs             | 468 +++++++++++++--------
 control_plane/src/pageserver.rs            |  81 +---
 test_runner/fixtures/neon_fixtures.py      |  54 +--
 test_runner/regress/test_pageserver_api.py |  35 +-
 5 files changed, 412 insertions(+), 377 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 179a756135..18e395e2b5 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,8 +9,11 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{InitForceMode, LocalEnv};
-use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
+use control_plane::local_env::{
+    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
+    SafekeeperConf,
+};
+use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
@@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15";
 
 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 
-fn default_conf(num_pageservers: u16) -> String {
-    let mut template = format!(
-        r#"
-# Default built-in configuration, defined in main.rs
-control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
-
-[broker]
-listen_addr = '{DEFAULT_BROKER_ADDR}'
-
-[[safekeepers]]
-id = {DEFAULT_SAFEKEEPER_ID}
-pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
-http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
-
-"#,
-    );
-
-    for i in 0..num_pageservers {
-        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
-        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
-        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
-
-        template += &format!(
-            r#"
-[[pageservers]]
-id = {pageserver_id}
-listen_pg_addr = '127.0.0.1:{pg_port}'
-listen_http_addr = '127.0.0.1:{http_port}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-"#,
-            trust_auth = AuthType::Trust,
-        )
-    }
-
-    template
-}
-
 ///
 /// Timelines tree element used as a value in the HashMap.
 ///
@@ -152,7 +117,7 @@ fn main() -> Result<()> {
     };
 
     match subcommand_result {
-        Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
+        Ok(Some(updated_env)) => updated_env.persist_config()?,
         Ok(None) => (),
         Err(e) => {
             eprintln!("command failed: {e:?}");
@@ -341,55 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }
 
 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let num_pageservers = init_match
-        .get_one::<u16>("num-pageservers")
-        .expect("num-pageservers arg has a default");
-    // Create config file
-    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
+    let num_pageservers = init_match.get_one::<u16>("num-pageservers");
+
+    let force = init_match.get_one("force").expect("we set a default value");
+
+    // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
+    let init_conf: NeonLocalInitConf = if let Some(config_path) =
+        init_match.get_one::<PathBuf>("config")
+    {
+        // User (likely the Python test suite) provided a description of the environment.
+        if num_pageservers.is_some() {
+            bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
+        }
         // load and parse the file
-        std::fs::read_to_string(config_path).with_context(|| {
+        let contents = std::fs::read_to_string(config_path).with_context(|| {
             format!(
                 "Could not read configuration file '{}'",
                 config_path.display()
             )
-        })?
+        })?;
+        toml_edit::de::from_str(&contents)?
     } else {
-        // Built-in default config
-        default_conf(*num_pageservers)
+        // User (likely interactive) did not provide a description of the environment, give them the default
+        NeonLocalInitConf {
+            control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
+            broker: NeonBroker {
+                listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
+            },
+            safekeepers: vec![SafekeeperConf {
+                id: DEFAULT_SAFEKEEPER_ID,
+                pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
+                http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
+                ..Default::default()
+            }],
+            pageservers: (0..num_pageservers.copied().unwrap_or(1))
+                .map(|i| {
+                    let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+                    let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+                    let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+                    NeonLocalInitPageserverConf {
+                        id: pageserver_id,
+                        listen_pg_addr: format!("127.0.0.1:{pg_port}"),
+                        listen_http_addr: format!("127.0.0.1:{http_port}"),
+                        pg_auth_type: AuthType::Trust,
+                        http_auth_type: AuthType::Trust,
+                        other: Default::default(),
+                    }
+                })
+                .collect(),
+            pg_distrib_dir: None,
+            neon_distrib_dir: None,
+            default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
+            storage_controller: None,
+            control_plane_compute_hook_api: None,
+        }
     };
 
-    let pageserver_config: toml_edit::Document =
-        if let Some(path) = init_match.get_one::<PathBuf>("pageserver-config") {
-            std::fs::read_to_string(path)?.parse()?
-        } else {
-            toml_edit::Document::new()
-        };
-
-    let pg_version = init_match
-        .get_one::<u32>("pg-version")
-        .copied()
-        .context("Failed to parse postgres version from the argument string")?;
-
-    let mut env =
-        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_one("force").expect("we set a default value");
-    env.init(pg_version, force)
-        .context("Failed to initialize neon repository")?;
-
-    // Create remote storage location for default LocalFs remote storage
-    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
-
-    // Initialize pageserver, create initial tenant and timeline.
-    for ps_conf in &env.pageservers {
-        PageServerNode::from_env(&env, ps_conf)
-            .initialize(pageserver_config.clone())
-            .unwrap_or_else(|e| {
-                eprintln!("pageserver init failed: {e:?}");
-                exit(1);
-            });
-    }
-
-    Ok(env)
+    LocalEnv::init(init_conf, force)
+        .context("materialize initial neon_local environment on disk")?;
+    Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
 }
 
 /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -1418,9 +1393,7 @@ fn cli() -> Command {
     let num_pageservers_arg = Arg::new("num-pageservers")
         .value_parser(value_parser!(u16))
         .long("num-pageservers")
-        .help("How many pageservers to create (default 1)")
-        .required(false)
-        .default_value("1");
+        .help("How many pageservers to create (default 1)");
 
     let update_catalog = Arg::new("update-catalog")
         .value_parser(value_parser!(bool))
@@ -1454,14 +1427,6 @@ fn cli() -> Command {
                         .value_parser(value_parser!(PathBuf))
                         .value_name("config")
                 )
-                .arg(
-                    Arg::new("pageserver-config")
-                        .long("pageserver-config")
-                        .required(false)
-                        .value_parser(value_parser!(PathBuf))
-                        .value_name("pageserver-config")
-                        .help("Merge the provided pageserver config into the one generated by neon_local."),
-                )
                 .arg(pg_version_arg.clone())
                 .arg(force_arg)
         )
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 7abbbce95a..d13884198e 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -3,7 +3,7 @@
 //! Now it also provides init method which acts like a stub for proper installation
 //! script which will use local paths.
 
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 
 use clap::ValueEnum;
 use postgres_backend::AuthType;
@@ -23,6 +23,8 @@ use utils::{
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
 };
 
+use crate::pageserver::PageServerNode;
+use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;
 
 pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -34,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
     // Base directory for all the nodes (the pageserver, safekeepers and
     // compute endpoints).
@@ -42,59 +44,99 @@ pub struct LocalEnv {
     // This is not stored in the config file. Rather, this is the path where the
     // config file itself is. It is read from the NEON_REPO_DIR env variable or
     // '.neon' if not given.
-    #[serde(skip)]
     pub base_data_dir: PathBuf,
 
     // Path to postgres distribution. It's expected that "bin", "include",
     // "lib", "share" from postgres distribution are there. If at some point
     // in time we will be able to run against vanilla postgres we may split that
     // to four separate paths and match OS-specific installation layout.
-    #[serde(default)]
     pub pg_distrib_dir: PathBuf,
 
     // Path to pageserver binary.
-    #[serde(default)]
     pub neon_distrib_dir: PathBuf,
 
     // Default tenant ID to use with the 'neon_local' command line utility, when
     // --tenant_id is not explicitly specified.
-    #[serde(default)]
     pub default_tenant_id: Option<TenantId>,
 
     // used to issue tokens during e.g pg start
-    #[serde(default)]
     pub private_key_path: PathBuf,
 
     pub broker: NeonBroker,
 
     // Configuration for the storage controller (1 per neon_local environment)
-    #[serde(default)]
     pub storage_controller: NeonStorageControllerConf,
 
     /// This Vec must always contain at least one pageserver
+    /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s.
+    /// NB: not used anymore except for informing users that they need to change their `.neon/config`.
     pub pageservers: Vec<PageServerConf>,
 
-    #[serde(default)]
     pub safekeepers: Vec<SafekeeperConf>,
 
     // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
     // be propagated into each pageserver's configuration.
-    #[serde(default)]
     pub control_plane_api: Option<Url>,
 
     // Control plane upcall API for storage controller.  If set, this will be propagated into the
     // storage controller's configuration.
-    #[serde(default)]
     pub control_plane_compute_hook_api: Option<Url>,
 
     /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
-    #[serde(default)]
     // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
     // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
     // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
+}
+
+/// On-disk state stored in `.neon/config`.
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct OnDiskConfig {
+    pub pg_distrib_dir: PathBuf,
+    pub neon_distrib_dir: PathBuf,
+    pub default_tenant_id: Option<TenantId>,
+    pub private_key_path: PathBuf,
+    pub broker: NeonBroker,
+    pub storage_controller: NeonStorageControllerConf,
+    #[serde(
+        skip_serializing,
+        deserialize_with = "fail_if_pageservers_field_specified"
+    )]
+    pub pageservers: Vec<PageServerConf>,
+    pub safekeepers: Vec<SafekeeperConf>,
+    pub control_plane_api: Option<Url>,
+    pub control_plane_compute_hook_api: Option<Url>,
     branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }
 
+fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    Err(serde::de::Error::custom(
+        "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
+         Please remove the `pageservers` from your .neon/config.",
+    ))
+}
+
+/// The description of the neon_local env to be initialized by `neon_local init --config`.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct NeonLocalInitConf {
+    // TODO: do we need this? Seems unused
+    pub pg_distrib_dir: Option<PathBuf>,
+    // TODO: do we need this? Seems unused
+    pub neon_distrib_dir: Option<PathBuf>,
+    pub default_tenant_id: TenantId,
+    pub broker: NeonBroker,
+    pub storage_controller: Option<NeonStorageControllerConf>,
+    pub pageservers: Vec<NeonLocalInitPageserverConf>,
+    pub safekeepers: Vec<SafekeeperConf>,
+    pub control_plane_api: Option<Option<Url>>,
+    pub control_plane_compute_hook_api: Option<Option<Url>>,
+}
+
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -141,24 +183,18 @@ impl NeonBroker {
     }
 }
 
+// neon_local needs to know this subset of pageserver configuration.
+// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
+// It can get stale if `pageserver.toml` is changed.
+// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
-    // node id
     pub id: NodeId,
-
-    // Pageserver connection settings
     pub listen_pg_addr: String,
     pub listen_http_addr: String,
-
-    // auth type used for the PG and HTTP ports
     pub pg_auth_type: AuthType,
     pub http_auth_type: AuthType,
-
-    pub(crate) virtual_file_io_engine: Option<String>,
-    pub(crate) get_vectored_impl: Option<String>,
-    pub(crate) get_impl: Option<String>,
-    pub(crate) validate_vectored_get: Option<bool>,
 }
 
 impl Default for PageServerConf {
@@ -169,10 +205,40 @@ impl Default for PageServerConf {
             listen_http_addr: String::new(),
             pg_auth_type: AuthType::Trust,
             http_auth_type: AuthType::Trust,
-            virtual_file_io_engine: None,
-            get_vectored_impl: None,
-            get_impl: None,
-            validate_vectored_get: None,
+        }
+    }
+}
+
+/// The toml that can be passed to `neon_local init --config`.
+/// This is a subset of the `pageserver.toml` configuration.
+// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
+#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
+pub struct NeonLocalInitPageserverConf {
+    pub id: NodeId,
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,
+    pub pg_auth_type: AuthType,
+    pub http_auth_type: AuthType,
+    #[serde(flatten)]
+    pub other: HashMap<String, toml::Value>,
+}
+
+impl From<&NeonLocalInitPageserverConf> for PageServerConf {
+    fn from(conf: &NeonLocalInitPageserverConf) -> Self {
+        let NeonLocalInitPageserverConf {
+            id,
+            listen_pg_addr,
+            listen_http_addr,
+            pg_auth_type,
+            http_auth_type,
+            other: _,
+        } = conf;
+        Self {
+            id: *id,
+            listen_pg_addr: listen_pg_addr.clone(),
+            listen_http_addr: listen_http_addr.clone(),
+            pg_auth_type: *pg_auth_type,
+            http_auth_type: *http_auth_type,
         }
     }
 }
@@ -360,44 +426,7 @@ impl LocalEnv {
             .collect()
     }
 
-    /// Create a LocalEnv from a config file.
-    ///
-    /// Unlike 'load_config', this function fills in any defaults that are missing
-    /// from the config file.
-    pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
-        let mut env: LocalEnv = toml::from_str(toml)?;
-
-        // Find postgres binaries.
-        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
-        // Note that later in the code we assume, that distrib dirs follow the same pattern
-        // for all postgres versions.
-        if env.pg_distrib_dir == Path::new("") {
-            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-                env.pg_distrib_dir = postgres_bin.into();
-            } else {
-                let cwd = env::current_dir()?;
-                env.pg_distrib_dir = cwd.join("pg_install")
-            }
-        }
-
-        // Find neon binaries.
-        if env.neon_distrib_dir == Path::new("") {
-            env::current_exe()?
-                .parent()
-                .unwrap()
-                .clone_into(&mut env.neon_distrib_dir);
-        }
-
-        if env.pageservers.is_empty() {
-            anyhow::bail!("Configuration must contain at least one pageserver");
-        }
-
-        env.base_data_dir = base_path();
-
-        Ok(env)
-    }
-
-    /// Locate and load config
+    ///  Construct `Self` from on-disk state.
     pub fn load_config() -> anyhow::Result<Self> {
         let repopath = base_path();
 
@@ -411,38 +440,129 @@ impl LocalEnv {
         // TODO: check that it looks like a neon repository
 
         // load and parse file
-        let config = fs::read_to_string(repopath.join("config"))?;
-        let mut env: LocalEnv = toml::from_str(config.as_str())?;
+        let config_file_contents = fs::read_to_string(repopath.join("config"))?;
+        let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
+        let mut env = {
+            let OnDiskConfig {
+                pg_distrib_dir,
+                neon_distrib_dir,
+                default_tenant_id,
+                private_key_path,
+                broker,
+                storage_controller,
+                pageservers,
+                safekeepers,
+                control_plane_api,
+                control_plane_compute_hook_api,
+                branch_name_mappings,
+            } = on_disk_config;
+            LocalEnv {
+                base_data_dir: repopath.clone(),
+                pg_distrib_dir,
+                neon_distrib_dir,
+                default_tenant_id,
+                private_key_path,
+                broker,
+                storage_controller,
+                pageservers,
+                safekeepers,
+                control_plane_api,
+                control_plane_compute_hook_api,
+                branch_name_mappings,
+            }
+        };
 
-        env.base_data_dir = repopath;
+        // The source of truth for pageserver configuration is the pageserver.toml.
+        assert!(
+            env.pageservers.is_empty(),
+            "we ensure this during deserialization"
+        );
+        env.pageservers = {
+            let iter = std::fs::read_dir(&repopath).context("open dir")?;
+            let mut pageservers = Vec::new();
+            for res in iter {
+                let dentry = res?;
+                const PREFIX: &str = "pageserver_";
+                let dentry_name = dentry
+                    .file_name()
+                    .into_string()
+                    .ok()
+                    .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
+                    .unwrap();
+                if !dentry_name.starts_with(PREFIX) {
+                    continue;
+                }
+                if !dentry.file_type().context("determine file type")?.is_dir() {
+                    anyhow::bail!("expected a directory, got {:?}", dentry.path());
+                }
+                let id = dentry_name[PREFIX.len()..]
+                    .parse::<NodeId>()
+                    .with_context(|| format!("parse id from {:?}", dentry.path()))?;
+                // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
+                #[derive(serde::Serialize, serde::Deserialize)]
+                // (allow unknown fields, unlike PageServerConf)
+                struct PageserverConfigTomlSubset {
+                    id: NodeId,
+                    listen_pg_addr: String,
+                    listen_http_addr: String,
+                    pg_auth_type: AuthType,
+                    http_auth_type: AuthType,
+                }
+                let config_toml_path = dentry.path().join("pageserver.toml");
+                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
+                    &std::fs::read_to_string(&config_toml_path)
+                        .with_context(|| format!("read {:?}", config_toml_path))?,
+                )
+                .context("parse pageserver.toml")?;
+                let PageserverConfigTomlSubset {
+                    id: config_toml_id,
+                    listen_pg_addr,
+                    listen_http_addr,
+                    pg_auth_type,
+                    http_auth_type,
+                } = config_toml;
+                let conf = PageServerConf {
+                    id: {
+                        anyhow::ensure!(
+                            config_toml_id == id,
+                            "id mismatch: config_toml.id={config_toml_id} id={id}",
+                        );
+                        id
+                    },
+                    listen_pg_addr,
+                    listen_http_addr,
+                    pg_auth_type,
+                    http_auth_type,
+                };
+                pageservers.push(conf);
+            }
+            pageservers
+        };
 
         Ok(env)
     }
 
-    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
-        // Currently, the user first passes a config file with 'neon_local init --config=<path>'
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
-        // a bit sad.
-        let mut conf_content = r#"# This file describes a local deployment of the page server
-# and safekeeeper node. It is read by the 'neon_local' command-line
-# utility.
-"#
-        .to_string();
-
-        // Convert the LocalEnv to a toml file.
-        //
-        // This could be as simple as this:
-        //
-        // conf_content += &toml::to_string_pretty(env)?;
-        //
-        // But it results in a "values must be emitted before tables". I'm not sure
-        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
-        // Maybe rust reorders the fields to squeeze avoid padding or something?
-        // In any case, converting to toml::Value first, and serializing that, works.
-        // See https://github.com/alexcrichton/toml-rs/issues/142
-        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
+    pub fn persist_config(&self) -> anyhow::Result<()> {
+        Self::persist_config_impl(
+            &self.base_data_dir,
+            &OnDiskConfig {
+                pg_distrib_dir: self.pg_distrib_dir.clone(),
+                neon_distrib_dir: self.neon_distrib_dir.clone(),
+                default_tenant_id: self.default_tenant_id,
+                private_key_path: self.private_key_path.clone(),
+                broker: self.broker.clone(),
+                storage_controller: self.storage_controller.clone(),
+                pageservers: vec![], // it's skip_serializing anyway
+                safekeepers: self.safekeepers.clone(),
+                control_plane_api: self.control_plane_api.clone(),
+                control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
+                branch_name_mappings: self.branch_name_mappings.clone(),
+            },
+        )
+    }
 
+    pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
+        let conf_content = &toml::to_string_pretty(config)?;
         let target_config_path = base_path.join("config");
         fs::write(&target_config_path, conf_content).with_context(|| {
             format!(
@@ -467,17 +587,13 @@ impl LocalEnv {
         }
     }
 
-    //
-    // Initialize a new Neon repository
-    //
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
-        // check if config already exists
-        let base_path = &self.base_data_dir;
-        ensure!(
-            base_path != Path::new(""),
-            "repository base path is missing"
-        );
+    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
+    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
+        let base_path = base_path();
+        assert_ne!(base_path, Path::new(""));
+        let base_path = &base_path;
 
+        // create base_path dir
         if base_path.exists() {
             match force {
                 InitForceMode::MustNotExist => {
@@ -509,74 +625,96 @@ impl LocalEnv {
                 }
             }
         }
-
-        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
-            bail!(
-                "Can't find postgres binary at {}",
-                self.pg_bin_dir(pg_version)?.display()
-            );
-        }
-        for binary in ["pageserver", "safekeeper"] {
-            if !self.neon_distrib_dir.join(binary).exists() {
-                bail!(
-                    "Can't find binary '{binary}' in neon distrib dir '{}'",
-                    self.neon_distrib_dir.display()
-                );
-            }
-        }
-
         if !base_path.exists() {
             fs::create_dir(base_path)?;
         }
 
+        let NeonLocalInitConf {
+            pg_distrib_dir,
+            neon_distrib_dir,
+            default_tenant_id,
+            broker,
+            storage_controller,
+            pageservers,
+            safekeepers,
+            control_plane_api,
+            control_plane_compute_hook_api,
+        } = conf;
+
+        // Find postgres binaries.
+        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
+        // Note that later in the code we assume, that distrib dirs follow the same pattern
+        // for all postgres versions.
+        let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
+            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+                postgres_bin.into()
+            } else {
+                let cwd = env::current_dir().unwrap();
+                cwd.join("pg_install")
+            }
+        });
+
+        // Find neon binaries.
+        let neon_distrib_dir = neon_distrib_dir
+            .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
+
         // Generate keypair for JWT.
         //
         // The keypair is only needed if authentication is enabled in any of the
         // components. For convenience, we generate the keypair even if authentication
         // is not enabled, so that you can easily enable it after the initialization
-        // step. However, if the key generation fails, we treat it as non-fatal if
-        // authentication was not enabled.
-        if self.private_key_path == PathBuf::new() {
-            match generate_auth_keys(
-                base_path.join("auth_private_key.pem").as_path(),
-                base_path.join("auth_public_key.pem").as_path(),
-            ) {
-                Ok(()) => {
-                    self.private_key_path = PathBuf::from("auth_private_key.pem");
-                }
-                Err(e) => {
-                    if !self.auth_keys_needed() {
-                        eprintln!("Could not generate keypair for JWT authentication: {e}");
-                        eprintln!("Continuing anyway because authentication was not enabled");
-                        self.private_key_path = PathBuf::from("auth_private_key.pem");
-                    } else {
-                        return Err(e);
-                    }
-                }
-            }
+        // step.
+        generate_auth_keys(
+            base_path.join("auth_private_key.pem").as_path(),
+            base_path.join("auth_public_key.pem").as_path(),
+        )
+        .context("generate auth keys")?;
+        let private_key_path = PathBuf::from("auth_private_key.pem");
+
+        // create the runtime type because the remaining initialization code below needs
+        // a LocalEnv instance op operation
+        // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
+        let env = LocalEnv {
+            base_data_dir: base_path.clone(),
+            pg_distrib_dir,
+            neon_distrib_dir,
+            default_tenant_id: Some(default_tenant_id),
+            private_key_path,
+            broker,
+            storage_controller: storage_controller.unwrap_or_default(),
+            pageservers: pageservers.iter().map(Into::into).collect(),
+            safekeepers,
+            control_plane_api: control_plane_api.unwrap_or_default(),
+            control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
+            branch_name_mappings: Default::default(),
+        };
+
+        // create endpoints dir
+        fs::create_dir_all(env.endpoints_path())?;
+
+        // create safekeeper dirs
+        for safekeeper in &env.safekeepers {
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
         }
 
-        fs::create_dir_all(self.endpoints_path())?;
-
-        for safekeeper in &self.safekeepers {
-            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
+        // initialize pageserver state
+        for (i, ps) in pageservers.into_iter().enumerate() {
+            let runtime_ps = &env.pageservers[i];
+            assert_eq!(&PageServerConf::from(&ps), runtime_ps);
+            fs::create_dir(env.pageserver_data_dir(ps.id))?;
+            PageServerNode::from_env(&env, runtime_ps)
+                .initialize(ps)
+                .context("pageserver init failed")?;
         }
 
-        for ps in &self.pageservers {
-            fs::create_dir(self.pageserver_data_dir(ps.id))?;
-        }
+        // setup remote remote location for default LocalFs remote storage
+        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
 
-        self.persist_config(base_path)
-    }
-
-    fn auth_keys_needed(&self) -> bool {
-        self.pageservers.iter().any(|ps| {
-            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
-        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        env.persist_config()
     }
 }
 
-fn base_path() -> PathBuf {
+pub fn base_path() -> PathBuf {
     match std::env::var_os("NEON_REPO_DIR") {
         Some(val) => PathBuf::from(val),
         None => PathBuf::from(".neon"),
@@ -619,31 +757,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
     }
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn simple_conf_parsing() {
-        let simple_conf_toml = include_str!("../simple.conf");
-        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
-        assert!(
-            simple_conf_parse_result.is_ok(),
-            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
-        );
-
-        let string_to_replace = "listen_addr = '127.0.0.1:50051'";
-        let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
-        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
-        assert!(
-            spoiled_url_toml.contains(spoiled_url_str),
-            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
-        );
-        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
-        assert!(
-            spoiled_url_parse_result.is_err(),
-            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
-        );
-    }
-}
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 6046c93bad..5a84763697 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -30,7 +30,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use crate::local_env::PageServerConf;
+use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
 use crate::{background_process, local_env::LocalEnv};
 
 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -76,9 +76,11 @@ impl PageServerNode {
 
     fn pageserver_init_make_toml(
         &self,
-        cli_overrides: toml_edit::Document,
+        conf: NeonLocalInitPageserverConf,
     ) -> anyhow::Result<toml_edit::Document> {
-        // TODO: this is a legacy code, it should be refactored to use toml_edit directly.
+        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
+
+        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
 
         // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
         let pg_distrib_dir_param = format!(
@@ -86,61 +88,9 @@ impl PageServerNode {
             self.env.pg_distrib_dir_raw().display()
         );
 
-        let PageServerConf {
-            id,
-            listen_pg_addr,
-            listen_http_addr,
-            pg_auth_type,
-            http_auth_type,
-            virtual_file_io_engine,
-            get_vectored_impl,
-            get_impl,
-            validate_vectored_get,
-        } = &self.conf;
-
-        let id = format!("id={}", id);
-
-        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
-        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
-
-        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
-        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
-            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
-        } else {
-            String::new()
-        };
-        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
-            format!("get_vectored_impl='{get_vectored_impl}'")
-        } else {
-            String::new()
-        };
-        let get_impl = if let Some(get_impl) = get_impl {
-            format!("get_impl='{get_impl}'")
-        } else {
-            String::new()
-        };
-        let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
-            format!("validate_vectored_get={validate_vectored_get}")
-        } else {
-            String::new()
-        };
-
         let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
 
-        let mut overrides = vec![
-            id,
-            pg_distrib_dir_param,
-            http_auth_type_param,
-            pg_auth_type_param,
-            listen_http_addr_param,
-            listen_pg_addr_param,
-            broker_endpoint_param,
-            virtual_file_io_engine,
-            get_vectored_impl,
-            get_impl,
-            validate_vectored_get,
-        ];
+        let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
 
         if let Some(control_plane_api) = &self.env.control_plane_api {
             overrides.push(format!(
@@ -150,7 +100,7 @@ impl PageServerNode {
 
             // Storage controller uses the same auth as pageserver: if JWT is enabled
             // for us, we will also need it to talk to them.
-            if matches!(http_auth_type, AuthType::NeonJWT) {
+            if matches!(conf.http_auth_type, AuthType::NeonJWT) {
                 let jwt_token = self
                     .env
                     .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -159,20 +109,23 @@ impl PageServerNode {
             }
         }
 
-        if !cli_overrides.contains_key("remote_storage") {
+        if !conf.other.contains_key("remote_storage") {
             overrides.push(format!(
                 "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
             ));
         }
 
-        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
+        if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
             // Keys are generated in the toplevel repo dir, pageservers' workdirs
             // are one level below that, so refer to keys with ../
             overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
         }
 
         // Apply the user-provided overrides
-        overrides.push(cli_overrides.to_string());
+        overrides.push(
+            toml_edit::ser::to_string_pretty(&conf)
+                .expect("we deserialized this from toml earlier"),
+        );
 
         // Turn `overrides` into a toml document.
         // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
@@ -188,8 +141,8 @@ impl PageServerNode {
     }
 
     /// Initializes a pageserver node by creating its config with the overrides provided.
-    pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> {
-        self.pageserver_init(config_overrides)
+    pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
+        self.pageserver_init(conf)
             .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
     }
 
@@ -209,7 +162,7 @@ impl PageServerNode {
         self.start_node().await
     }
 
-    fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> {
+    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
         let datadir = self.repo_path();
         let node_id = self.conf.id;
         println!(
@@ -221,7 +174,7 @@ impl PageServerNode {
         io::stdout().flush()?;
 
         let config = self
-            .pageserver_init_make_toml(cli_overrides)
+            .pageserver_init_make_toml(conf)
             .context("make pageserver toml")?;
         let config_file_path = datadir.join("pageserver.toml");
         let mut config_file = std::fs::OpenOptions::new()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 30cec4c726..f618c508bc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,7 +14,7 @@ import textwrap
 import threading
 import time
 import uuid
-from contextlib import ExitStack, closing, contextmanager
+from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
@@ -1054,14 +1054,14 @@ class NeonEnv:
         self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
         self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
 
-        # Create a config file corresponding to the options
+        # Create the neon_local's `NeonLocalInitConf`
         cfg: Dict[str, Any] = {
             "default_tenant_id": str(self.initial_tenant),
             "broker": {
                 "listen_addr": self.broker.listen_addr(),
             },
-            "pageservers": [],
             "safekeepers": [],
+            "pageservers": [],
         }
 
         if self.control_plane_api is not None:
@@ -1100,6 +1100,17 @@ class NeonEnv:
             if config.pageserver_validate_vectored_get is not None:
                 ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get
 
+            if self.pageserver_remote_storage is not None:
+                ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
+                    self.pageserver_remote_storage
+                )
+
+            if config.pageserver_config_override is not None:
+                for o in config.pageserver_config_override.split(";"):
+                    override = toml.loads(o)
+                    for key, value in override.items():
+                        ps_cfg[key] = value
+
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
                 NeonPageserver(
@@ -1136,7 +1147,6 @@ class NeonEnv:
         self.neon_cli.init(
             cfg,
             force=config.config_init_force,
-            pageserver_config_override=config.pageserver_config_override,
         )
 
     def start(self):
@@ -1722,46 +1732,22 @@ class NeonCli(AbstractNeonCli):
 
     def init(
         self,
-        config: Dict[str, Any],
+        init_config: Dict[str, Any],
         force: Optional[str] = None,
-        pageserver_config_override: Optional[str] = None,
     ) -> "subprocess.CompletedProcess[str]":
-        remote_storage = self.env.pageserver_remote_storage
-
-        ps_config = {}
-        if remote_storage is not None:
-            ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage)
-
-        if pageserver_config_override is not None:
-            for o in pageserver_config_override.split(";"):
-                override = toml.loads(o)
-                for key, value in override.items():
-                    ps_config[key] = value
-
-        with ExitStack() as stack:
-            ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+"))
-            ps_config_file.write(toml.dumps(ps_config))
-            ps_config_file.flush()
-
-            neon_local_config = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+"))
-            neon_local_config.write(toml.dumps(config))
-            neon_local_config.flush()
+        with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile:
+            init_config_tmpfile.write(toml.dumps(init_config))
+            init_config_tmpfile.flush()
 
             cmd = [
                 "init",
-                f"--config={neon_local_config.name}",
-                "--pg-version",
-                self.env.pg_version,
-                f"--pageserver-config={ps_config_file.name}",
+                f"--config={init_config_tmpfile.name}",
             ]
 
             if force is not None:
                 cmd.extend(["--force", force])
 
-            s3_env_vars = None
-            if isinstance(remote_storage, S3Storage):
-                s3_env_vars = remote_storage.access_env_vars()
-            res = self.raw_cli(cmd, extra_env_vars=s3_env_vars)
+            res = self.raw_cli(cmd)
             res.check_returncode()
         return res
 
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index be351db429..bd7e4f118f 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -2,6 +2,7 @@ import subprocess
 from pathlib import Path
 from typing import Optional
 
+import toml
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
@@ -12,9 +13,11 @@ from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
-def test_pageserver_init_node_id(
-    neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path
-):
+def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
+    """
+    NB: The neon_local doesn't use `--init` mode anymore, but our production
+    deployment still does => https://github.com/neondatabase/aws/pull/1322
+    """
     workdir = neon_simple_env.pageserver.workdir
     pageserver_config = workdir / "pageserver.toml"
     pageserver_bin = neon_binpath / "pageserver"
@@ -28,18 +31,36 @@ def test_pageserver_init_node_id(
             stderr=subprocess.PIPE,
         )
 
-    # remove initial config and stop existing pageserver
-    pageserver_config.unlink()
     neon_simple_env.pageserver.stop()
 
-    bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'])
+    with open(neon_simple_env.pageserver.config_toml_path, "r") as f:
+        ps_config = toml.load(f)
+
+    required_config_keys = [
+        "pg_distrib_dir",
+        "listen_pg_addr",
+        "listen_http_addr",
+        "pg_auth_type",
+        "http_auth_type",
+    ]
+    required_config_overrides = [
+        f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
+    ]
+
+    pageserver_config.unlink()
+
+    bad_init = run_pageserver(["--init", *required_config_overrides])
     assert (
         bad_init.returncode == 1
     ), "pageserver should not be able to init new config without the node id"
     assert 'missing config value "id"' in bad_init.stderr
     assert not pageserver_config.exists(), "config file should not be created after init error"
 
-    good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
+    good_init_cmd = [
+        "--init",
+        f"--config-override=id={ps_config['id']}",
+        *required_config_overrides,
+    ]
     completed_init = run_pageserver(good_init_cmd)
     assert (
         completed_init.returncode == 0

From 0457980728d93e6c3a4fc25b6f5b6052bdff1457 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Wed, 8 May 2024 16:22:13 +0100
Subject: [PATCH 005/130] Fix flaky test_gc_of_remote_layers (#7647)

Fixes flaky test `test_gc_of_remote_layers`, which was failing because
of the `Nothing to GC` pageserver log.
I looked into the fails, it seems that backround `gc_loop` sometimes
started GC for initial tenant, which wasn't
configured to disable GC. The fix is to not create initial tenant with
enabled gc at all.

Fixes #7538
---
 test_runner/regress/test_layer_eviction.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 5c967fd72e..b178baea11 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -159,7 +159,9 @@ def test_basic_eviction(
 def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
-    env = neon_env_builder.init_start()
+    # don't create initial tenant, we'll create it manually with custom config
+    env = neon_env_builder.init_configs()
+    env.start()
 
     tenant_config = {
         "pitr_interval": "1s",  # set to non-zero, so GC actually does something

From 21e1a496a3f706097578de396a9107813c541001 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Wed, 8 May 2024 08:49:57 -0700
Subject: [PATCH 006/130] Expose LSN and replication delay as metrics (#7610)

## Problem
We currently have no way to see what the current LSN of a compute its,
and in case of read replicas, we don't know what the difference in LSNs
is.

## Summary of changes
Adds these metrics
---
 vm-image-spec.yaml | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 41ca16f16b..56538630ac 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -244,6 +244,49 @@ files:
         values: [approximate_working_set_size]
         query: |
           select neon.approximate_working_set_size(false) as approximate_working_set_size;
+
+      - metric_name: current_lsn
+        type: gauge
+        help: 'Current LSN of the database'
+        key_labels:
+        values: [lsn]
+        query: |
+          select
+            case
+              when pg_catalog.pg_is_in_recovery()
+              then pg_last_wal_replay_lsn()
+              else pg_current_wal_lsn()
+            end as lsn;
+
+      - metric_name: replication_delay_bytes
+        type: gauge
+        help: 'Bytes between received and replayed LSN'
+        key_labels:
+        values: [replication_delay_bytes]
+        query: |
+          SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;
+
+      - metric_name: replication_delay_seconds
+        type: gauge
+        help: 'Time since last LSN was replayed'
+        key_labels:
+        values: [replication_delay_seconds]
+        query: |
+          SELECT
+            CASE
+              WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
+              ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
+            END AS replication_delay_seconds;
+
+      - metric_name: checkpoint_stats
+        type: gauge
+        help: 'Number of requested and scheduled checkpoints'
+        key_labels:
+        values:
+          - checkpoints_req
+          - checkpoints_timed
+        query: |
+          SELECT checkpoints_req, checkpoints_timed FROM pg_stat_bgwriter;
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling
@@ -295,7 +338,6 @@ files:
         values: [approximate_working_set_size]
         query: |
           select neon.approximate_working_set_size(false) as approximate_working_set_size;
-
 build: |
   # Build cgroup-tools
   #

From 1173ee6a7e1168e671a6847eb94807b45c703490 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 8 May 2024 11:53:54 -0400
Subject: [PATCH 007/130] chore(neon_test_utils): restrict installation to
 superuser (#7624)

The test utils should only be used during tests. Users should not be
able to create this extension on their own.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pgxn/neon_test_utils/neon_test_utils.control | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control
index 5f6d640835..8c5b9b5dfe 100644
--- a/pgxn/neon_test_utils/neon_test_utils.control
+++ b/pgxn/neon_test_utils/neon_test_utils.control
@@ -3,4 +3,5 @@ comment = 'helpers for neon testing and debugging'
 default_version = '1.1'
 module_pathname = '$libdir/neon_test_utils'
 relocatable = true
-trusted = true
+trusted = false
+superuser = true

From ca154d9cd843dcc10d234266be0effff091e71e7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 8 May 2024 17:50:21 +0100
Subject: [PATCH 008/130] pageserver: local layer path followups (#7640)

- Rename "filename" types which no longer map directly to a filename
(LayerFileName -> LayerName)
- Add a -v1- part to local layer paths to smooth the path to future
updates (we anticipate a -v2- that uses checksums later)
- Rename methods that refer to the string-ized version of a LayerName to
no longer be called "filename"
- Refactor reconcile() function to use a LocalLayerFileMetadata type
that includes the local path, rather than carrying local path separately
in a tuple and unwrap()'ing it later.
---
 pageserver/benches/bench_layer_map.rs         |   4 +-
 pageserver/ctl/src/index_part.rs              |   4 +-
 pageserver/src/deletion_queue.rs              |  16 +--
 pageserver/src/deletion_queue/list_writer.rs  |   4 +-
 pageserver/src/disk_usage_eviction_task.rs    |   8 +-
 pageserver/src/http/routes.rs                 |   6 +-
 .../src/tenant/remote_timeline_client.rs      |  64 +++++-----
 .../tenant/remote_timeline_client/download.rs |   4 +-
 .../tenant/remote_timeline_client/index.rs    |   6 +-
 pageserver/src/tenant/secondary.rs            |   4 +-
 pageserver/src/tenant/secondary/downloader.rs |  12 +-
 pageserver/src/tenant/secondary/heatmap.rs    |   8 +-
 pageserver/src/tenant/storage_layer.rs        |  20 +--
 .../src/tenant/storage_layer/delta_layer.rs   |  14 ++-
 .../src/tenant/storage_layer/image_layer.rs   |  21 ++--
 pageserver/src/tenant/storage_layer/layer.rs  |  39 +++---
 .../src/tenant/storage_layer/layer_desc.rs    |  28 ++---
 .../{filename.rs => layer_name.rs}            | 115 +++++++++---------
 pageserver/src/tenant/timeline.rs             |  71 +++++------
 .../src/tenant/timeline/detach_ancestor.rs    |   2 +-
 pageserver/src/tenant/timeline/init.rs        |  93 ++++++++------
 .../src/tenant/timeline/layer_manager.rs      |   2 +-
 pageserver/src/tenant/upload_queue.rs         |   8 +-
 s3_scrubber/src/checks.rs                     |  23 ++--
 s3_scrubber/src/tenant_snapshot.rs            |  18 +--
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 test_runner/fixtures/pageserver/types.py      |  29 +++--
 .../regress/test_layers_from_future.py        |   8 +-
 .../regress/test_pageserver_generations.py    |   4 +-
 29 files changed, 324 insertions(+), 315 deletions(-)
 rename pageserver/src/tenant/storage_layer/{filename.rs => layer_name.rs} (72%)

diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index 5d05af0c00..1d02aa7709 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,7 +1,7 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
 use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
     let mut updates = layer_map.batch_update();
     for fname in filenames {
         let fname = fname.unwrap();
-        let fname = LayerFileName::from_str(&fname).unwrap();
+        let fname = LayerName::from_str(&fname).unwrap();
         let layer = PersistentLayerDesc::from(fname);
 
         let lsn_range = layer.get_lsn_range();
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 20e5572914..0d010eb009 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -3,7 +3,7 @@ use std::collections::HashMap;
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
 
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
             let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
             #[derive(serde::Serialize)]
             struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
                 disk_consistent_lsn: Lsn,
                 timeline_metadata: &'a TimelineMetadata,
             }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index e3c11cb299..c937309d83 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
 use list_writer::ListWriterQueueMessage;
 use validator::ValidatorQueueMessage;
 
-use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
+use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
 
 // TODO: configurable for how long to wait before executing deletions
 
@@ -479,7 +479,7 @@ impl DeletionQueueClient {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         current_generation: Generation,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> Result<(), DeletionQueueError> {
         if current_generation.is_none() {
             debug!("Enqueuing deletions in legacy mode, skipping queue");
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         current_generation: Generation,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> Result<(), DeletionQueueError> {
         metrics::DELETION_QUEUE
             .keys_submitted
@@ -734,20 +734,20 @@ mod test {
     use crate::{
         control_plane_client::RetryForeverError,
         repository::Key,
-        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
+        tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
     };
 
     use super::*;
     pub const TIMELINE_ID: TimelineId =
         TimelineId::from_array(hex!("11223344556677881122334455667788"));
 
-    pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
+    pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
         key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
         lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
     });
 
     // When you need a second layer in a test.
-    pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
+    pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
         key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
         lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
     });
@@ -797,7 +797,7 @@ mod test {
         /// Returns remote layer file name, suitable for use in assert_remote_files
         fn write_remote_layer(
             &self,
-            file_name: LayerFileName,
+            file_name: LayerName,
             gen: Generation,
         ) -> anyhow::Result<String> {
             let tenant_shard_id = self.harness.tenant_shard_id;
@@ -952,7 +952,7 @@ mod test {
         let client = ctx.deletion_queue.new_client();
         client.recover(HashMap::new())?;
 
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
         let tenant_shard_id = ctx.harness.tenant_shard_id;
 
         let content: Vec<u8> = "victim1 contents".into();
diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs
index 3a3d600ac2..ae3b2c9180 100644
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
 
@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
     // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
     // have a config object handy to project it to a remote key, and need the consuming worker
     // to do it for you.
-    pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
+    pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
     pub(super) objects: Vec<RemotePath>,
 
     /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 400930245b..ebeb8bbb20 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -64,7 +64,7 @@ use crate::{
         mgr::TenantManager,
         remote_timeline_client::LayerFileMetadata,
         secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
     },
 };
 
@@ -604,7 +604,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 pub(crate) struct EvictionSecondaryLayer {
     pub(crate) secondary_tenant: Arc<SecondaryTenant>,
     pub(crate) timeline_id: TimelineId,
-    pub(crate) name: LayerFileName,
+    pub(crate) name: LayerName,
     pub(crate) metadata: LayerFileMetadata,
 }
 
@@ -637,9 +637,9 @@ impl EvictionLayer {
         }
     }
 
-    pub(crate) fn get_name(&self) -> LayerFileName {
+    pub(crate) fn get_name(&self) -> LayerName {
         match self {
-            Self::Attached(l) => l.layer_desc().filename(),
+            Self::Attached(l) => l.layer_desc().layer_name(),
             Self::Secondary(sl) => sl.name.clone(),
         }
     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 83b7b8a45e..a8ca642dc5 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -63,7 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::SpawnMode;
@@ -1229,7 +1229,7 @@ async fn layer_download_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let layer_name = LayerFileName::from_str(layer_file_name)
+    let layer_name = LayerName::from_str(layer_file_name)
         .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
     let state = get_state(&request);
 
@@ -1261,7 +1261,7 @@ async fn evict_timeline_layer_handler(
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
     let state = get_state(&request);
 
-    let layer_name = LayerFileName::from_str(layer_file_name)
+    let layer_name = LayerName::from_str(layer_file_name)
         .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
 
     let timeline =
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 356a0dc51c..bbe4e16378 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -240,7 +240,7 @@ use utils::id::{TenantId, TimelineId};
 use self::index::IndexPart;
 
 use super::metadata::MetadataUpdate;
-use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
+use super::storage_layer::{Layer, LayerName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
 
@@ -503,7 +503,7 @@ impl RemoteTimelineClient {
     /// On success, returns the size of the downloaded file.
     pub async fn download_layer_file(
         &self,
-        layer_file_name: &LayerFileName,
+        layer_file_name: &LayerName,
         layer_metadata: &LayerFileMetadata,
         cancel: &CancellationToken,
         ctx: &RequestContext,
@@ -677,7 +677,7 @@ impl RemoteTimelineClient {
             for layer in layers {
                 upload_queue
                     .latest_files
-                    .insert(layer.layer_desc().filename(), layer.metadata());
+                    .insert(layer.layer_desc().layer_name(), layer.metadata());
             }
 
             self.schedule_index_upload(upload_queue);
@@ -713,7 +713,7 @@ impl RemoteTimelineClient {
 
         upload_queue
             .latest_files
-            .insert(layer.layer_desc().filename(), metadata.clone());
+            .insert(layer.layer_desc().layer_name(), metadata.clone());
         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
         info!(
@@ -737,7 +737,7 @@ impl RemoteTimelineClient {
     /// successfully.
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: &[LayerName],
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
@@ -765,7 +765,7 @@ impl RemoteTimelineClient {
         // the layer files as "dangling". this is fine, at worst case we create work for the
         // scrubber.
 
-        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
+        let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
 
         self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
 
@@ -780,9 +780,9 @@ impl RemoteTimelineClient {
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
         names: I,
-    ) -> Vec<(LayerFileName, LayerFileMetadata)>
+    ) -> Vec<(LayerName, LayerFileMetadata)>
     where
-        I: IntoIterator<Item = LayerFileName>,
+        I: IntoIterator<Item = LayerName>,
     {
         // Decorate our list of names with each name's metadata, dropping
         // names that are unexpectedly missing from our metadata.  This metadata
@@ -832,7 +832,7 @@ impl RemoteTimelineClient {
     /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
     pub(crate) fn schedule_deletion_of_unlinked(
         self: &Arc<Self>,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
@@ -845,7 +845,7 @@ impl RemoteTimelineClient {
     fn schedule_deletion_of_unlinked0(
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
-        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerName, LayerFileMetadata)>,
     ) {
         // Filter out any layers which were not created by this tenant shard.  These are
         // layers that originate from some ancestor shard after a split, and may still
@@ -914,7 +914,7 @@ impl RemoteTimelineClient {
             self.schedule_layer_file_upload0(upload_queue, layer.clone());
         }
 
-        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
+        let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
 
         self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
         self.launch_queued_tasks(upload_queue);
@@ -1144,7 +1144,7 @@ impl RemoteTimelineClient {
             &self.tenant_shard_id.tenant_id,
             &self.timeline_id,
             self.tenant_shard_id.to_index(),
-            &uploaded.layer_desc().filename(),
+            &uploaded.layer_desc().layer_name(),
             uploaded.metadata().generation,
         );
 
@@ -1185,7 +1185,7 @@ impl RemoteTimelineClient {
                 .get_timeline_id()
                 .expect("Source timeline should be alive"),
             self.tenant_shard_id.to_index(),
-            &adopted.layer_desc().filename(),
+            &adopted.layer_desc().layer_name(),
             adopted.metadata().generation,
         );
 
@@ -1193,7 +1193,7 @@ impl RemoteTimelineClient {
             &self.tenant_shard_id.tenant_id,
             &self.timeline_id,
             self.tenant_shard_id.to_index(),
-            &adopted_as.layer_desc().filename(),
+            &adopted_as.layer_desc().layer_name(),
             adopted_as.metadata().generation,
         );
 
@@ -1527,7 +1527,7 @@ impl RemoteTimelineClient {
                         &self.tenant_shard_id.tenant_id,
                         &self.timeline_id,
                         layer_metadata.shard,
-                        &layer.layer_desc().filename(),
+                        &layer.layer_desc().layer_name(),
                         layer_metadata.generation,
                     );
 
@@ -1896,14 +1896,14 @@ pub fn remote_layer_path(
     tenant_id: &TenantId,
     timeline_id: &TimelineId,
     shard: ShardIndex,
-    layer_file_name: &LayerFileName,
+    layer_file_name: &LayerName,
     generation: Generation,
 ) -> RemotePath {
     // Generation-aware key format
     let path = format!(
         "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
         shard.get_suffix(),
-        layer_file_name.file_name(),
+        layer_file_name,
         generation.get_suffix()
     );
 
@@ -2000,8 +2000,8 @@ mod tests {
         TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
     }
 
-    fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
-        let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
+    fn assert_file_list(a: &HashSet<LayerName>, b: &[&str]) {
+        let mut avec: Vec<String> = a.iter().map(|x| x.to_string()).collect();
         avec.sort();
 
         let mut bvec = b.to_vec();
@@ -2127,7 +2127,7 @@ mod tests {
             .layer_metadata
             .keys()
             .map(|f| f.to_owned())
-            .collect::<HashSet<LayerFileName>>();
+            .collect::<HashSet<LayerName>>();
         let initial_layer = {
             assert!(initial_layers.len() == 1);
             initial_layers.into_iter().next().unwrap()
@@ -2153,7 +2153,7 @@ mod tests {
             ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
         ]
         .into_iter()
-        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+        .map(|(name, contents): (LayerName, Vec<u8>)| {
 
             let local_path = local_layer_path(
                 harness.conf,
@@ -2234,9 +2234,9 @@ mod tests {
                 .map(|f| f.to_owned())
                 .collect(),
             &[
-                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &initial_layer.to_string(),
+                &layers[0].layer_desc().layer_name().to_string(),
+                &layers[1].layer_desc().layer_name().to_string(),
             ],
         );
         assert_eq!(index_part.metadata, metadata);
@@ -2250,7 +2250,7 @@ mod tests {
         // keep using schedule_layer_file_deletion because we don't have a way to wait for the
         // spawn_blocking started by the drop.
         client
-            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()])
             .unwrap();
         {
             let mut guard = client.upload_queue.lock().unwrap();
@@ -2268,9 +2268,9 @@ mod tests {
         }
         assert_remote_files(
             &[
-                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &initial_layer.to_string(),
+                &layers[0].layer_desc().layer_name().to_string(),
+                &layers[1].layer_desc().layer_name().to_string(),
                 "index_part.json",
             ],
             &remote_timeline_dir,
@@ -2283,9 +2283,9 @@ mod tests {
 
         assert_remote_files(
             &[
-                &initial_layer.file_name(),
-                &layers[1].layer_desc().filename().file_name(),
-                &layers[2].layer_desc().filename().file_name(),
+                &initial_layer.to_string(),
+                &layers[1].layer_desc().layer_name().to_string(),
+                &layers[2].layer_desc().layer_name().to_string(),
                 "index_part.json",
             ],
             &remote_timeline_dir,
@@ -2305,7 +2305,7 @@ mod tests {
         } = TestSetup::new("metrics").await.unwrap();
         let client = timeline.remote_client.as_ref().unwrap();
 
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
         let local_path = local_layer_path(
             harness.conf,
             &timeline.tenant_shard_id,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index c86b22d481..b464437422 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -22,7 +22,7 @@ use crate::context::RequestContext;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
@@ -48,7 +48,7 @@ pub async fn download_layer_file<'a>(
     storage: &'a GenericRemoteStorage,
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
-    layer_file_name: &'a LayerFileName,
+    layer_file_name: &'a LayerName,
     layer_metadata: &'a LayerFileMetadata,
     cancel: &CancellationToken,
     ctx: &RequestContext,
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 7e0619945f..3e05905afa 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -8,7 +8,7 @@ use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 
 use crate::tenant::metadata::TimelineMetadata;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;
@@ -75,7 +75,7 @@ pub struct IndexPart {
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
     /// that latest version stores.
-    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
 
     // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
     // It's duplicated for convenience when reading the serialized structure, but is
@@ -104,7 +104,7 @@ impl IndexPart {
     pub const FILE_NAME: &'static str = "index_part.json";
 
     fn new(
-        layers_and_metadata: &HashMap<LayerFileName, LayerFileMetadata>,
+        layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
         disk_consistent_lsn: Lsn,
         metadata: TimelineMetadata,
     ) -> Self {
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 0bb25f0ace..7075044baf 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -23,7 +23,7 @@ use super::{
     mgr::TenantManager,
     remote_timeline_client::LayerFileMetadata,
     span::debug_assert_current_span_has_tenant_id,
-    storage_layer::{layer::local_layer_path, LayerFileName},
+    storage_layer::{layer::local_layer_path, LayerName},
 };
 
 use pageserver_api::{
@@ -182,7 +182,7 @@ impl SecondaryTenant {
         self: &Arc<Self>,
         conf: &PageServerConf,
         timeline_id: TimelineId,
-        name: LayerFileName,
+        name: LayerName,
         metadata: LayerFileMetadata,
     ) {
         debug_assert_current_span_has_tenant_id();
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 092630e74d..2a8f83be95 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -22,7 +22,7 @@ use crate::{
             FAILED_REMOTE_OP_RETRIES,
         },
         span::debug_assert_current_span_has_tenant_id,
-        storage_layer::{layer::local_layer_path, LayerFileName},
+        storage_layer::{layer::local_layer_path, LayerName},
         tasks::{warn_when_period_overrun, BackgroundLoopKind},
     },
     virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
@@ -111,7 +111,7 @@ impl OnDiskState {
         _conf: &'static PageServerConf,
         _tenant_shard_id: &TenantShardId,
         _imeline_id: &TimelineId,
-        _ame: LayerFileName,
+        _ame: LayerName,
         metadata: LayerFileMetadata,
         access_time: SystemTime,
     ) -> Self {
@@ -124,10 +124,10 @@ impl OnDiskState {
 
 #[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
-    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
+    pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
 
     /// We remember when layers were evicted, to prevent re-downloading them.
-    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
+    pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }
 
 /// This state is written by the secondary downloader, it is opaque
@@ -997,7 +997,7 @@ async fn init_timeline_state(
 
     // As we iterate through layers found on disk, we will look up their metadata from this map.
     // Layers not present in metadata will be discarded.
-    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
+    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
         heatmap.layers.iter().map(|l| (&l.name, l)).collect();
 
     while let Some(dentry) = dir
@@ -1034,7 +1034,7 @@ async fn init_timeline_state(
             continue;
         }
 
-        match LayerFileName::from_str(file_name) {
+        match LayerName::from_str(file_name) {
             Ok(name) => {
                 let remote_meta = heatmap_metadata.get(&name);
                 match remote_meta {
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 73cdf6c6d4..ca91ec24c6 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,8 +1,6 @@
 use std::time::SystemTime;
 
-use crate::tenant::{
-    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
-};
+use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
 
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -31,7 +29,7 @@ pub(crate) struct HeatMapTimeline {
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
-    pub(super) name: LayerFileName,
+    pub(super) name: LayerName,
     pub(super) metadata: IndexLayerMetadata,
 
     #[serde_as(as = "TimestampSeconds<i64>")]
@@ -42,7 +40,7 @@ pub(crate) struct HeatMapLayer {
 
 impl HeatMapLayer {
     pub(crate) fn new(
-        name: LayerFileName,
+        name: LayerName,
         metadata: IndexLayerMetadata,
         access_time: SystemTime,
     ) -> Self {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 4f1b56ef9f..94a5e9ec47 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,11 +1,11 @@
 //! Common traits and structs for layers
 
 pub mod delta_layer;
-mod filename;
 pub mod image_layer;
 pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
+mod layer_name;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
@@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit;
 use utils::{id::TimelineId, lsn::Lsn};
 
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
-pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
+pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
 
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
@@ -646,8 +646,8 @@ pub mod tests {
 
     use super::*;
 
-    impl From<DeltaFileName> for PersistentLayerDesc {
-        fn from(value: DeltaFileName) -> Self {
+    impl From<DeltaLayerName> for PersistentLayerDesc {
+        fn from(value: DeltaLayerName) -> Self {
             PersistentLayerDesc::new_delta(
                 TenantShardId::from([0; 18]),
                 TimelineId::from_array([0; 16]),
@@ -658,8 +658,8 @@ pub mod tests {
         }
     }
 
-    impl From<ImageFileName> for PersistentLayerDesc {
-        fn from(value: ImageFileName) -> Self {
+    impl From<ImageLayerName> for PersistentLayerDesc {
+        fn from(value: ImageLayerName) -> Self {
             PersistentLayerDesc::new_img(
                 TenantShardId::from([0; 18]),
                 TimelineId::from_array([0; 16]),
@@ -670,11 +670,11 @@ pub mod tests {
         }
     }
 
-    impl From<LayerFileName> for PersistentLayerDesc {
-        fn from(value: LayerFileName) -> Self {
+    impl From<LayerName> for PersistentLayerDesc {
+        fn from(value: LayerName) -> Self {
             match value {
-                LayerFileName::Delta(d) => Self::from(d),
-                LayerFileName::Image(i) => Self::from(i),
+                LayerName::Delta(d) => Self::from(d),
+                LayerName::Image(i) => Self::from(i),
             }
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6fd96b0e2f..c38c9bb656 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -57,6 +57,7 @@ use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
+use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;
@@ -68,7 +69,8 @@ use utils::{
 };
 
 use super::{
-    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
 };
 
 ///
@@ -309,13 +311,13 @@ impl DeltaLayer {
             .and_then(|res| res)?;
 
         // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
+        let expected_layer_name = self.layer_desc().layer_name();
 
-        if actual_filename != expected_filename {
+        if actual_layer_name != expected_layer_name {
             println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+            println!("actual: {:?}", actual_layer_name.to_string());
+            println!("expected: {:?}", expected_layer_name.to_string());
         }
 
         Ok(Arc::new(loaded))
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 1477a1fc33..c9874873e4 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -54,6 +54,7 @@ use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
+use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
@@ -65,8 +66,10 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
+use super::layer_name::ImageLayerName;
+use super::{
+    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
+};
 
 ///
 /// Header stored in the beginning of the file
@@ -231,7 +234,7 @@ impl ImageLayer {
         conf: &PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
-        fname: &ImageFileName,
+        fname: &ImageLayerName,
     ) -> Utf8PathBuf {
         let rand_string: String = rand::thread_rng()
             .sample_iter(&Alphanumeric)
@@ -267,13 +270,13 @@ impl ImageLayer {
             .and_then(|res| res)?;
 
         // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
+        let expected_layer_name = self.layer_desc().layer_name();
 
-        if actual_filename != expected_filename {
+        if actual_layer_name != expected_layer_name {
             println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+            println!("actual: {:?}", actual_layer_name.to_string());
+            println!("expected: {:?}", expected_layer_name.to_string());
         }
 
         Ok(loaded)
@@ -635,7 +638,7 @@ impl ImageLayerWriterInner {
             conf,
             timeline_id,
             tenant_shard_id,
-            &ImageFileName {
+            &ImageLayerName {
                 key_range: key_range.clone(),
                 lsn,
             },
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b5e69db7f4..b5b0260327 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -25,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
 use super::delta_layer::{self, DeltaEntry};
 use super::image_layer;
 use super::{
-    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
+    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
     ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };
 
@@ -128,19 +128,20 @@ pub(crate) fn local_layer_path(
     conf: &PageServerConf,
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
-    layer_file_name: &LayerFileName,
+    layer_file_name: &LayerName,
     _generation: &Generation,
 ) -> Utf8PathBuf {
     let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
 
-    timeline_path.join(layer_file_name.file_name())
+    timeline_path.join(layer_file_name.to_string())
 
-    // TOOD: include generation in the name in now+1 releases.
-    // timeline_path.join(format!(
-    //     "{}{}",
-    //     layer_file_name.file_name(),
-    //     generation.get_suffix()
-    // ))
+    // TODO: switch to enabling new-style layer paths after next release
+    // if generation.is_none() {
+    //     // Without a generation, we may only use legacy path style
+    //     timeline_path.join(layer_file_name.to_string())
+    // } else {
+    //     timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
+    // }
 }
 
 impl Layer {
@@ -148,7 +149,7 @@ impl Layer {
     pub(crate) fn for_evicted(
         conf: &'static PageServerConf,
         timeline: &Arc<Timeline>,
-        file_name: LayerFileName,
+        file_name: LayerName,
         metadata: LayerFileMetadata,
     ) -> Self {
         let local_path = local_layer_path(
@@ -189,7 +190,7 @@ impl Layer {
         conf: &'static PageServerConf,
         timeline: &Arc<Timeline>,
         local_path: Utf8PathBuf,
-        file_name: LayerFileName,
+        file_name: LayerName,
         metadata: LayerFileMetadata,
     ) -> ResidentLayer {
         let desc = PersistentLayerDesc::from_filename(
@@ -261,7 +262,7 @@ impl Layer {
                 conf,
                 &timeline.tenant_shard_id,
                 &timeline.timeline_id,
-                &desc.filename(),
+                &desc.layer_name(),
                 &timeline.generation,
             );
 
@@ -689,7 +690,7 @@ impl Drop for LayerInner {
         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
 
         let path = std::mem::take(&mut self.path);
-        let file_name = self.layer_desc().filename();
+        let file_name = self.layer_desc().layer_name();
         let file_size = self.layer_desc().file_size;
         let timeline = self.timeline.clone();
         let meta = self.metadata();
@@ -782,7 +783,9 @@ impl LayerInner {
 
         LayerInner {
             conf,
-            debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
+            debug_str: {
+                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
+            },
             path: local_path,
             desc,
             timeline: Arc::downgrade(timeline),
@@ -1120,7 +1123,7 @@ impl LayerInner {
 
         let result = client
             .download_layer_file(
-                &self.desc.filename(),
+                &self.desc.layer_name(),
                 &self.metadata(),
                 &timeline.cancel,
                 ctx,
@@ -1257,7 +1260,7 @@ impl LayerInner {
     }
 
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.desc.filename().file_name();
+        let layer_name = self.desc.layer_name().to_string();
 
         let resident = self
             .inner
@@ -1271,7 +1274,7 @@ impl LayerInner {
             let lsn_range = &self.desc.lsn_range;
 
             HistoricLayerInfo::Delta {
-                layer_file_name,
+                layer_file_name: layer_name,
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn_range.start,
                 lsn_end: lsn_range.end,
@@ -1282,7 +1285,7 @@ impl LayerInner {
             let lsn = self.desc.image_layer_lsn();
 
             HistoricLayerInfo::Image {
-                layer_file_name,
+                layer_file_name: layer_name,
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn,
                 remote: !resident,
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index c375923e81..a89b66e4a1 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn};
 
 use crate::repository::Key;
 
-use super::{DeltaFileName, ImageFileName, LayerFileName};
+use super::{DeltaLayerName, ImageLayerName, LayerName};
 
 use serde::{Deserialize, Serialize};
 
@@ -51,7 +51,7 @@ impl PersistentLayerDesc {
     }
 
     pub fn short_id(&self) -> impl Display {
-        self.filename()
+        self.layer_name()
     }
 
     #[cfg(test)]
@@ -103,14 +103,14 @@ impl PersistentLayerDesc {
     pub fn from_filename(
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
-        filename: LayerFileName,
+        filename: LayerName,
         file_size: u64,
     ) -> Self {
         match filename {
-            LayerFileName::Image(i) => {
+            LayerName::Image(i) => {
                 Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
             }
-            LayerFileName::Delta(d) => Self::new_delta(
+            LayerName::Delta(d) => Self::new_delta(
                 tenant_shard_id,
                 timeline_id,
                 d.key_range,
@@ -132,34 +132,34 @@ impl PersistentLayerDesc {
         lsn..(lsn + 1)
     }
 
-    /// Get a delta file name for this layer.
+    /// Get a delta layer name for this layer.
     ///
     /// Panic: if this is not a delta layer.
-    pub fn delta_file_name(&self) -> DeltaFileName {
+    pub fn delta_layer_name(&self) -> DeltaLayerName {
         assert!(self.is_delta);
-        DeltaFileName {
+        DeltaLayerName {
             key_range: self.key_range.clone(),
             lsn_range: self.lsn_range.clone(),
         }
     }
 
-    /// Get a delta file name for this layer.
+    /// Get a image layer name for this layer.
     ///
     /// Panic: if this is not an image layer, or the lsn range is invalid
-    pub fn image_file_name(&self) -> ImageFileName {
+    pub fn image_layer_name(&self) -> ImageLayerName {
         assert!(!self.is_delta);
         assert!(self.lsn_range.start + 1 == self.lsn_range.end);
-        ImageFileName {
+        ImageLayerName {
             key_range: self.key_range.clone(),
             lsn: self.lsn_range.start,
         }
     }
 
-    pub fn filename(&self) -> LayerFileName {
+    pub fn layer_name(&self) -> LayerName {
         if self.is_delta {
-            self.delta_file_name().into()
+            self.delta_layer_name().into()
         } else {
-            self.image_file_name().into()
+            self.image_layer_name().into()
         }
     }
 
diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
similarity index 72%
rename from pageserver/src/tenant/storage_layer/filename.rs
rename to pageserver/src/tenant/storage_layer/layer_name.rs
index fff66a9d07..c733404693 100644
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -15,29 +15,29 @@ use super::PersistentLayerDesc;
 
 // Note: Timeline::load_layer_map() relies on this sort order
 #[derive(PartialEq, Eq, Clone, Hash)]
-pub struct DeltaFileName {
+pub struct DeltaLayerName {
     pub key_range: Range<Key>,
     pub lsn_range: Range<Lsn>,
 }
 
-impl std::fmt::Debug for DeltaFileName {
+impl std::fmt::Debug for DeltaLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         use super::RangeDisplayDebug;
 
-        f.debug_struct("DeltaFileName")
+        f.debug_struct("DeltaLayerName")
             .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("lsn_range", &self.lsn_range)
             .finish()
     }
 }
 
-impl PartialOrd for DeltaFileName {
+impl PartialOrd for DeltaLayerName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for DeltaFileName {
+impl Ord for DeltaLayerName {
     fn cmp(&self, other: &Self) -> Ordering {
         let mut cmp = self.key_range.start.cmp(&other.key_range.start);
         if cmp != Ordering::Equal {
@@ -57,16 +57,14 @@ impl Ord for DeltaFileName {
     }
 }
 
-/// Represents the filename of a DeltaLayer
+/// Represents the region of the LSN-Key space covered by a DeltaLayer
 ///
 /// ```text
 ///    <key start>-<key end>__<LSN start>-<LSN end>
 /// ```
-impl DeltaFileName {
-    ///
-    /// Parse a string as a delta file name. Returns None if the filename does not
-    /// match the expected pattern.
-    ///
+impl DeltaLayerName {
+    /// Parse the part of a delta layer's file name that represents the LayerName. Returns None
+    /// if the filename does not match the expected pattern.
     pub fn parse_str(fname: &str) -> Option<Self> {
         let mut parts = fname.split("__");
         let mut key_parts = parts.next()?.split('-');
@@ -105,14 +103,14 @@ impl DeltaFileName {
             // or panic?
         }
 
-        Some(DeltaFileName {
+        Some(DeltaLayerName {
             key_range: key_start..key_end,
             lsn_range: start_lsn..end_lsn,
         })
     }
 }
 
-impl fmt::Display for DeltaFileName {
+impl fmt::Display for DeltaLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
             f,
@@ -126,29 +124,29 @@ impl fmt::Display for DeltaFileName {
 }
 
 #[derive(PartialEq, Eq, Clone, Hash)]
-pub struct ImageFileName {
+pub struct ImageLayerName {
     pub key_range: Range<Key>,
     pub lsn: Lsn,
 }
 
-impl std::fmt::Debug for ImageFileName {
+impl std::fmt::Debug for ImageLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         use super::RangeDisplayDebug;
 
-        f.debug_struct("ImageFileName")
+        f.debug_struct("ImageLayerName")
             .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("lsn", &self.lsn)
             .finish()
     }
 }
 
-impl PartialOrd for ImageFileName {
+impl PartialOrd for ImageLayerName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for ImageFileName {
+impl Ord for ImageLayerName {
     fn cmp(&self, other: &Self) -> Ordering {
         let mut cmp = self.key_range.start.cmp(&other.key_range.start);
         if cmp != Ordering::Equal {
@@ -164,7 +162,7 @@ impl Ord for ImageFileName {
     }
 }
 
-impl ImageFileName {
+impl ImageLayerName {
     pub fn lsn_as_range(&self) -> Range<Lsn> {
         // Saves from having to copypaste this all over
         PersistentLayerDesc::image_layer_lsn_range(self.lsn)
@@ -172,16 +170,14 @@ impl ImageFileName {
 }
 
 ///
-/// Represents the filename of an ImageLayer
+/// Represents the part of the Key-LSN space covered by an ImageLayer
 ///
 /// ```text
 ///    <key start>-<key end>__<LSN>
 /// ```
-impl ImageFileName {
-    ///
-    /// Parse a string as an image file name. Returns None if the filename does not
-    /// match the expected pattern.
-    ///
+impl ImageLayerName {
+    /// Parse a string as then LayerName part of an image layer file name. Returns None if the
+    /// filename does not match the expected pattern.
     pub fn parse_str(fname: &str) -> Option<Self> {
         let mut parts = fname.split("__");
         let mut key_parts = parts.next()?.split('-');
@@ -202,14 +198,14 @@ impl ImageFileName {
 
         let lsn = Lsn::from_hex(lsn_str).ok()?;
 
-        Some(ImageFileName {
+        Some(ImageLayerName {
             key_range: key_start..key_end,
             lsn,
         })
     }
 }
 
-impl fmt::Display for ImageFileName {
+impl fmt::Display for ImageLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
             f,
@@ -220,21 +216,24 @@ impl fmt::Display for ImageFileName {
         )
     }
 }
+
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
+/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// over time (e.g. across shard splits or compression). The physical filenames of layers in local
+/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
+/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
+/// and [`crate::tenant::storage_layer::layer::local_layer_path`])
 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
-pub enum LayerFileName {
-    Image(ImageFileName),
-    Delta(DeltaFileName),
+pub enum LayerName {
+    Image(ImageLayerName),
+    Delta(DeltaLayerName),
 }
 
-impl LayerFileName {
-    pub fn file_name(&self) -> String {
-        self.to_string()
-    }
-
+impl LayerName {
     /// Determines if this layer file is considered to be in future meaning we will discard these
     /// layers during timeline initialization from the given disk_consistent_lsn.
     pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
-        use LayerFileName::*;
+        use LayerName::*;
         match self {
             Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
             Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
@@ -243,7 +242,7 @@ impl LayerFileName {
     }
 
     pub(crate) fn kind(&self) -> &'static str {
-        use LayerFileName::*;
+        use LayerName::*;
         match self {
             Delta(_) => "delta",
             Image(_) => "image",
@@ -251,7 +250,7 @@ impl LayerFileName {
     }
 }
 
-impl fmt::Display for LayerFileName {
+impl fmt::Display for LayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             Self::Image(fname) => write!(f, "{fname}"),
@@ -260,25 +259,25 @@ impl fmt::Display for LayerFileName {
     }
 }
 
-impl From<ImageFileName> for LayerFileName {
-    fn from(fname: ImageFileName) -> Self {
+impl From<ImageLayerName> for LayerName {
+    fn from(fname: ImageLayerName) -> Self {
         Self::Image(fname)
     }
 }
-impl From<DeltaFileName> for LayerFileName {
-    fn from(fname: DeltaFileName) -> Self {
+impl From<DeltaLayerName> for LayerName {
+    fn from(fname: DeltaLayerName) -> Self {
         Self::Delta(fname)
     }
 }
 
-impl FromStr for LayerFileName {
+impl FromStr for LayerName {
     type Err = String;
 
     /// Conversion from either a physical layer filename, or the string-ization of
     /// Self. When loading a physical layer filename, we drop any extra information
     /// not needed to build Self.
     fn from_str(value: &str) -> Result<Self, Self::Err> {
-        let gen_suffix_regex = Regex::new("^(?<base>.+)-(?<gen>[0-9a-f]{8})$").unwrap();
+        let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
         let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
             Some(captures) => captures
                 .name("base")
@@ -288,8 +287,8 @@ impl FromStr for LayerFileName {
             None => value.into(),
         };
 
-        let delta = DeltaFileName::parse_str(&file_name);
-        let image = ImageFileName::parse_str(&file_name);
+        let delta = DeltaLayerName::parse_str(&file_name);
+        let image = ImageLayerName::parse_str(&file_name);
         let ok = match (delta, image) {
             (None, None) => {
                 return Err(format!(
@@ -304,7 +303,7 @@ impl FromStr for LayerFileName {
     }
 }
 
-impl serde::Serialize for LayerFileName {
+impl serde::Serialize for LayerName {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
         S: serde::Serializer,
@@ -316,19 +315,19 @@ impl serde::Serialize for LayerFileName {
     }
 }
 
-impl<'de> serde::Deserialize<'de> for LayerFileName {
+impl<'de> serde::Deserialize<'de> for LayerName {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
-        deserializer.deserialize_string(LayerFileNameVisitor)
+        deserializer.deserialize_string(LayerNameVisitor)
     }
 }
 
-struct LayerFileNameVisitor;
+struct LayerNameVisitor;
 
-impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
-    type Value = LayerFileName;
+impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
+    type Value = LayerName;
 
     fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
         write!(
@@ -349,16 +348,16 @@ mod test {
     use super::*;
     #[test]
     fn image_layer_parse() -> anyhow::Result<()> {
-        let expected = LayerFileName::Image(ImageFileName {
+        let expected = LayerName::Image(ImageLayerName {
             key_range: Key::from_i128(0)
                 ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
             lsn: Lsn::from_hex("00000000014FED58").unwrap(),
         });
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
         assert_eq!(parsed, expected,);
 
         // Omitting generation suffix is valid
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
         assert_eq!(parsed, expected,);
 
         Ok(())
@@ -366,17 +365,17 @@ mod test {
 
     #[test]
     fn delta_layer_parse() -> anyhow::Result<()> {
-        let expected = LayerFileName::Delta(DeltaFileName {
+        let expected = LayerName::Delta(DeltaLayerName {
             key_range: Key::from_i128(0)
                 ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
             lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
                 ..Lsn::from_hex("000000000154C481").unwrap(),
         });
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
         assert_eq!(parsed, expected);
 
         // Omitting generation suffix is valid
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
         assert_eq!(parsed, expected);
 
         Ok(())
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d6d012c70c..7edb922069 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -60,7 +60,7 @@ use std::{
     ops::ControlFlow,
 };
 
-use crate::tenant::storage_layer::layer::local_layer_path;
+use crate::tenant::timeline::init::LocalLayerFileMetadata;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
     metadata::TimelineMetadata,
@@ -75,7 +75,7 @@ use crate::{
     disk_usage_eviction_task::finite_f32,
     tenant::storage_layer::{
         AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
-        LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
+        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
         ValueReconstructState, ValuesReconstructState,
     },
 };
@@ -1905,7 +1905,7 @@ impl Timeline {
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
     pub(crate) async fn download_layer(
         &self,
-        layer_file_name: &LayerFileName,
+        layer_file_name: &LayerName,
     ) -> anyhow::Result<Option<bool>> {
         let Some(layer) = self.find_layer(layer_file_name).await else {
             return Ok(None);
@@ -1925,7 +1925,7 @@ impl Timeline {
     /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
     pub(crate) async fn evict_layer(
         &self,
-        layer_file_name: &LayerFileName,
+        layer_file_name: &LayerName,
     ) -> anyhow::Result<Option<bool>> {
         let _gate = self
             .gate
@@ -2387,13 +2387,13 @@ impl Timeline {
         index_part: Option<IndexPart>,
     ) -> anyhow::Result<()> {
         use init::{Decision::*, Discovered, DismissedLayer};
-        use LayerFileName::*;
+        use LayerName::*;
 
         let mut guard = self.layers.write().await;
 
         let timer = self.metrics.load_layer_map_histo.start_timer();
 
-        // Scan timeline directory and create ImageFileName and DeltaFilename
+        // Scan timeline directory and create ImageLayerName and DeltaFilename
         // structs representing all files on disk
         let timeline_path = self
             .conf
@@ -2463,33 +2463,35 @@ impl Timeline {
                 let mut needs_cleanup = Vec::new();
                 let mut total_physical_size = 0;
 
-                for (name, local_path, decision) in decided {
+                for (name, decision) in decided {
                     let decision = match decision {
                         Ok(UseRemote { local, remote }) => {
                             // Remote is authoritative, but we may still choose to retain
                             // the local file if the contents appear to match
-                            if local.file_size() == remote.file_size() {
+                            if local.metadata.file_size() == remote.file_size() {
                                 // Use the local file, but take the remote metadata so that we pick up
                                 // the correct generation.
-                                UseLocal(remote)
+                                UseLocal(
+                                    LocalLayerFileMetadata {
+                                        metadata: remote,
+                                        local_path: local.local_path
+                                    }
+                                )
                             } else {
-                                let local_path = local_path.as_ref().expect("Locally found layer must have path");
-                                init::cleanup_local_file_for_remote(local_path, &local, &remote)?;
+                                init::cleanup_local_file_for_remote(&local, &remote)?;
                                 UseRemote { local, remote }
                             }
                         }
                         Ok(decision) => decision,
                         Err(DismissedLayer::Future { local }) => {
-                            if local.is_some() {
-                                let local_path = local_path.expect("Locally found layer must have path");
-                                init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?;
+                            if let Some(local) = local {
+                                init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?;
                             }
                             needs_cleanup.push(name);
                             continue;
                         }
                         Err(DismissedLayer::LocalOnly(local)) => {
-                            let local_path = local_path.expect("Locally found layer must have path");
-                            init::cleanup_local_only_file(&local_path, &name, &local)?;
+                            init::cleanup_local_only_file(&name, &local)?;
                             // this file never existed remotely, we will have to do rework
                             continue;
                         }
@@ -2503,20 +2505,9 @@ impl Timeline {
                     tracing::debug!(layer=%name, ?decision, "applied");
 
                     let layer = match decision {
-                        UseLocal(m) => {
-                            total_physical_size += m.file_size();
-
-                            let local_path = local_path.unwrap_or_else(|| {
-                                local_layer_path(
-                                    conf,
-                                    &this.tenant_shard_id,
-                                    &this.timeline_id,
-                                    &name,
-                                    &m.generation,
-                                )
-                            });
-
-                            Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard()
+                        UseLocal(local) => {
+                            total_physical_size += local.metadata.file_size();
+                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard()
                         }
                         Evicted(remote) | UseRemote { remote, .. } => {
                             Layer::for_evicted(conf, &this, name, remote)
@@ -2997,10 +2988,10 @@ impl Timeline {
         }
     }
 
-    async fn find_layer(&self, layer_name: &LayerFileName) -> Option<Layer> {
+    async fn find_layer(&self, layer_name: &LayerName) -> Option<Layer> {
         let guard = self.layers.read().await;
         for historic_layer in guard.layer_map().iter_historic_layers() {
-            let historic_layer_name = historic_layer.filename();
+            let historic_layer_name = historic_layer.layer_name();
             if layer_name == &historic_layer_name {
                 return Some(guard.get_from_desc(&historic_layer));
             }
@@ -3030,7 +3021,7 @@ impl Timeline {
             let last_activity_ts = layer.access_stats().latest_activity_or_now();
 
             HeatMapLayer::new(
-                layer.layer_desc().filename(),
+                layer.layer_desc().layer_name(),
                 (&layer.metadata()).into(),
                 last_activity_ts,
             )
@@ -3177,7 +3168,7 @@ impl Timeline {
             if let Some(open_layer) = &layers.open_layer {
                 let start_lsn = open_layer.get_lsn_range().start;
                 if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
+                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
                     // Get all the data needed to reconstruct the page version from this layer.
                     // But if we have an older cached page image, no need to go past that.
                     let lsn_floor = max(cached_lsn + 1, start_lsn);
@@ -3206,7 +3197,7 @@ impl Timeline {
             for frozen_layer in layers.frozen_layers.iter().rev() {
                 let start_lsn = frozen_layer.get_lsn_range().start;
                 if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
+                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
                     let lsn_floor = max(cached_lsn + 1, start_lsn);
 
                     let frozen_layer = frozen_layer.clone();
@@ -4731,7 +4722,7 @@ impl Timeline {
             if l.get_lsn_range().end > horizon_cutoff {
                 debug!(
                     "keeping {} because it's newer than horizon_cutoff {}",
-                    l.filename(),
+                    l.layer_name(),
                     horizon_cutoff,
                 );
                 result.layers_needed_by_cutoff += 1;
@@ -4742,7 +4733,7 @@ impl Timeline {
             if l.get_lsn_range().end > pitr_cutoff {
                 debug!(
                     "keeping {} because it's newer than pitr_cutoff {}",
-                    l.filename(),
+                    l.layer_name(),
                     pitr_cutoff,
                 );
                 result.layers_needed_by_pitr += 1;
@@ -4761,7 +4752,7 @@ impl Timeline {
                 if &l.get_lsn_range().start <= retain_lsn {
                     debug!(
                         "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
-                        l.filename(),
+                        l.layer_name(),
                         retain_lsn,
                         l.is_incremental(),
                     );
@@ -4792,7 +4783,7 @@ impl Timeline {
             if !layers
                 .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
             {
-                debug!("keeping {} because it is the latest layer", l.filename());
+                debug!("keeping {} because it is the latest layer", l.layer_name());
                 result.layers_not_updated += 1;
                 continue 'outer;
             }
@@ -4800,7 +4791,7 @@ impl Timeline {
             // We didn't find any reason to keep this file, so remove it.
             debug!(
                 "garbage collecting {} is_dropped: xx is_incremental: {}",
-                l.filename(),
+                l.layer_name(),
                 l.is_incremental(),
             );
             layers_to_remove.push(l);
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 5c2b25da56..2641bf3d13 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -407,7 +407,7 @@ async fn remote_copy(
     let owned = crate::tenant::storage_layer::Layer::for_evicted(
         adoptee.conf,
         adoptee,
-        adopted.layer_desc().filename(),
+        adopted.layer_desc().layer_name(),
         metadata,
     );
 
diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs
index 9c33981807..66aa765015 100644
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -6,7 +6,7 @@ use crate::{
             self,
             index::{IndexPart, LayerFileMetadata},
         },
-        storage_layer::LayerFileName,
+        storage_layer::LayerName,
         Generation,
     },
     METADATA_FILE_NAME,
@@ -20,7 +20,7 @@ use utils::lsn::Lsn;
 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
     /// The only one we care about
-    Layer(LayerFileName, Utf8PathBuf, u64),
+    Layer(LayerName, Utf8PathBuf, u64),
     /// Old ephmeral files from previous launches, should be removed
     Ephemeral(String),
     /// Old temporary timeline files, unsure what these really are, should be removed
@@ -43,7 +43,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
         let direntry = direntry?;
         let file_name = direntry.file_name().to_string();
 
-        let discovered = match LayerFileName::from_str(&file_name) {
+        let discovered = match LayerName::from_str(&file_name) {
             Ok(file_name) => {
                 let file_size = direntry.metadata()?.len();
                 Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
@@ -72,6 +72,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
     Ok(ret)
 }
 
+/// Whereas `LayerFileMetadata` describes the metadata we would store in remote storage,
+/// this structure extends it with metadata describing the layer's presence in local storage.
+#[derive(Clone, Debug)]
+pub(super) struct LocalLayerFileMetadata {
+    pub(super) metadata: LayerFileMetadata,
+    pub(super) local_path: Utf8PathBuf,
+}
+
+impl LocalLayerFileMetadata {
+    pub fn new(
+        local_path: Utf8PathBuf,
+        file_size: u64,
+        generation: Generation,
+        shard: ShardIndex,
+    ) -> Self {
+        Self {
+            local_path,
+            metadata: LayerFileMetadata::new(file_size, generation, shard),
+        }
+    }
+}
+
 /// Decision on what to do with a layer file after considering its local and remote metadata.
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
@@ -80,11 +102,11 @@ pub(super) enum Decision {
     /// The layer is present locally, but local metadata does not match remote; we must
     /// delete it and treat it as evicted.
     UseRemote {
-        local: LayerFileMetadata,
+        local: LocalLayerFileMetadata,
         remote: LayerFileMetadata,
     },
     /// The layer is present locally, and metadata matches.
-    UseLocal(LayerFileMetadata),
+    UseLocal(LocalLayerFileMetadata),
 }
 
 /// A layer needs to be left out of the layer map.
@@ -92,39 +114,29 @@ pub(super) enum Decision {
 pub(super) enum DismissedLayer {
     /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
     Future {
-        /// The local metadata. `None` if the layer is only known through [`IndexPart`].
-        local: Option<LayerFileMetadata>,
+        /// `None` if the layer is only known through [`IndexPart`].
+        local: Option<LocalLayerFileMetadata>,
     },
     /// The layer only exists locally.
     ///
     /// In order to make crash safe updates to layer map, we must dismiss layers which are only
     /// found locally or not yet included in the remote `index_part.json`.
-    LocalOnly(LayerFileMetadata),
+    LocalOnly(LocalLayerFileMetadata),
 }
 
 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>,
+    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
     index_part: Option<&IndexPart>,
     disk_consistent_lsn: Lsn,
     generation: Generation,
     shard: ShardIndex,
-) -> Vec<(
-    LayerFileName,
-    Option<Utf8PathBuf>,
-    Result<Decision, DismissedLayer>,
-)> {
+) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
     use Decision::*;
 
-    // name => (local_path, local_metadata, remote_metadata)
-    type Collected = HashMap<
-        LayerFileName,
-        (
-            Option<Utf8PathBuf>,
-            Option<LayerFileMetadata>,
-            Option<LayerFileMetadata>,
-        ),
-    >;
+    // name => (local_metadata, remote_metadata)
+    type Collected =
+        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
 
     let mut discovered = discovered
         .into_iter()
@@ -135,8 +147,9 @@ pub(super) fn reconcile(
                 // it is not in IndexPart, in which case using our current generation makes sense
                 // because it will be uploaded in this generation.
                 (
-                    Some(local_path),
-                    Some(LayerFileMetadata::new(file_size, generation, shard)),
+                    Some(LocalLayerFileMetadata::new(
+                        local_path, file_size, generation, shard,
+                    )),
                     None,
                 ),
             )
@@ -152,20 +165,20 @@ pub(super) fn reconcile(
         .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
         .for_each(|(name, metadata)| {
             if let Some(existing) = discovered.get_mut(name) {
-                existing.2 = Some(metadata);
+                existing.1 = Some(metadata);
             } else {
-                discovered.insert(name.to_owned(), (None, None, Some(metadata)));
+                discovered.insert(name.to_owned(), (None, Some(metadata)));
             }
         });
 
     discovered
         .into_iter()
-        .map(|(name, (local_path, local, remote))| {
+        .map(|(name, (local, remote))| {
             let decision = if name.is_in_future(disk_consistent_lsn) {
                 Err(DismissedLayer::Future { local })
             } else {
                 match (local, remote) {
-                    (Some(local), Some(remote)) if local != remote => {
+                    (Some(local), Some(remote)) if local.metadata != remote => {
                         Ok(UseRemote { local, remote })
                     }
                     (Some(x), Some(_)) => Ok(UseLocal(x)),
@@ -177,7 +190,7 @@ pub(super) fn reconcile(
                 }
             };
 
-            (name, local_path, decision)
+            (name, decision)
         })
         .collect::<Vec<_>>()
 }
@@ -189,12 +202,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
 }
 
 pub(super) fn cleanup_local_file_for_remote(
-    path: &Utf8Path,
-    local: &LayerFileMetadata,
+    local: &LocalLayerFileMetadata,
     remote: &LayerFileMetadata,
 ) -> anyhow::Result<()> {
-    let local_size = local.file_size();
+    let local_size = local.metadata.file_size();
     let remote_size = remote.file_size();
+    let path = &local.local_path;
 
     let file_name = path.file_name().expect("must be file path");
     tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
@@ -211,7 +224,7 @@ pub(super) fn cleanup_local_file_for_remote(
 
 pub(super) fn cleanup_future_layer(
     path: &Utf8Path,
-    name: &LayerFileName,
+    name: &LayerName,
     disk_consistent_lsn: Lsn,
 ) -> anyhow::Result<()> {
     // future image layers are allowed to be produced always for not yet flushed to disk
@@ -223,12 +236,14 @@ pub(super) fn cleanup_future_layer(
 }
 
 pub(super) fn cleanup_local_only_file(
-    path: &Utf8Path,
-    name: &LayerFileName,
-    local: &LayerFileMetadata,
+    name: &LayerName,
+    local: &LocalLayerFileMetadata,
 ) -> anyhow::Result<()> {
     let kind = name.kind();
-    tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
-    std::fs::remove_file(path)?;
+    tracing::info!(
+        "found local-only {kind} layer {name}, metadata {:?}",
+        local.metadata
+    );
+    std::fs::remove_file(&local.local_path)?;
     Ok(())
 }
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 8e8d64e0c6..a72eb1b3bf 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -294,7 +294,7 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
         self.0
             .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.filename()))
+            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
             .expect("not found")
             .clone()
     }
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 0bf4d1e599..7797117e0f 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,4 +1,4 @@
-use super::storage_layer::LayerFileName;
+use super::storage_layer::LayerName;
 use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -45,7 +45,7 @@ pub(crate) struct UploadQueueInitialized {
 
     /// All layer files stored in the remote storage, taking into account all
     /// in-progress and queued operations
-    pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,
 
     /// How many file uploads or deletions been scheduled, since the
     /// last (scheduling of) metadata index upload?
@@ -89,7 +89,7 @@ pub(crate) struct UploadQueueInitialized {
     /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
     /// bug causing leaks, then it's better to not leave this enabled for production builds.
     #[cfg(feature = "testing")]
-    pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
+    pub(crate) dangling_files: HashMap<LayerName, Generation>,
 
     /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
     pub(crate) shutting_down: bool,
@@ -281,7 +281,7 @@ pub(crate) struct UploadTask {
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
+    pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
 }
 
 #[derive(Debug)]
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 7c0f699958..68133fc0a9 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing;
 use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use remote_storage::RemotePath;
 
@@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors(
                     for (layer, metadata) in index_part.layer_metadata {
                         if metadata.file_size == 0 {
                             result.errors.push(format!(
-                                "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(),
+                                "index_part.json contains a layer {} that has 0 size in its layer metadata", layer,
                             ))
                         }
 
@@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors(
                             // layer we think is missing.
                             result.errors.push(format!(
                                 "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
-                                layer.file_name(),
+                                layer,
                                 metadata.generation.get_suffix(),
                                 metadata.shard
                             ))
@@ -170,8 +170,7 @@ pub(crate) struct LayerRef {
 /// the tenant to query whether an object exists.
 #[derive(Default)]
 pub(crate) struct TenantObjectListing {
-    shard_timelines:
-        HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>,
+    shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>,
 }
 
 impl TenantObjectListing {
@@ -180,7 +179,7 @@ impl TenantObjectListing {
     pub(crate) fn push(
         &mut self,
         ttid: TenantShardTimelineId,
-        layers: HashSet<(LayerFileName, Generation)>,
+        layers: HashSet<(LayerName, Generation)>,
     ) {
         let shard_index = ShardIndex::new(
             ttid.tenant_shard_id.shard_number,
@@ -208,7 +207,7 @@ impl TenantObjectListing {
     pub(crate) fn check_ref(
         &mut self,
         timeline_id: TimelineId,
-        layer_file: &LayerFileName,
+        layer_file: &LayerName,
         metadata: &IndexLayerMetadata,
     ) -> bool {
         let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
@@ -224,7 +223,7 @@ impl TenantObjectListing {
         true
     }
 
-    pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> {
+    pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> {
         let mut result = Vec::new();
         for ((shard_index, timeline_id), layers) in &self.shard_timelines {
             for ((layer_file, generation), layer_ref) in layers {
@@ -249,23 +248,23 @@ pub(crate) enum BlobDataParseResult {
     Parsed {
         index_part: IndexPart,
         index_part_generation: Generation,
-        s3_layers: HashSet<(LayerFileName, Generation)>,
+        s3_layers: HashSet<(LayerName, Generation)>,
     },
     /// The remains of a deleted Timeline (i.e. an initdb archive only)
     Relic,
     Incorrect(Vec<String>),
 }
 
-fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> {
+fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
     match name.rsplit_once('-') {
         // FIXME: this is gross, just use a regex?
         Some((layer_filename, gen)) if gen.len() == 8 => {
-            let layer = layer_filename.parse::<LayerFileName>()?;
+            let layer = layer_filename.parse::<LayerName>()?;
             let gen =
                 Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
             Ok((layer, gen))
         }
-        _ => Ok((name.parse::<LayerFileName>()?, Generation::none())),
+        _ => Ok((name.parse::<LayerName>()?, Generation::none())),
     }
 }
 
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs
index 4eccad381b..2c93a8490a 100644
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -12,7 +12,7 @@ use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
 use utils::generation::Generation;
@@ -48,16 +48,16 @@ impl SnapshotDownloader {
     async fn download_layer(
         &self,
         ttid: TenantShardTimelineId,
-        layer_name: LayerFileName,
+        layer_name: LayerName,
         layer_metadata: IndexLayerMetadata,
-    ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> {
+    ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
         // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
         // different layer names (remote-style has the generation suffix)
         let local_path = self.output_path.join(format!(
             "{}/timelines/{}/{}{}",
             ttid.tenant_shard_id,
             ttid.timeline_id,
-            layer_name.file_name(),
+            layer_name,
             layer_metadata.generation.get_suffix()
         ));
 
@@ -76,7 +76,7 @@ impl SnapshotDownloader {
             let remote_layer_path = format!(
                 "{}{}{}",
                 timeline_root.prefix_in_bucket,
-                layer_name.file_name(),
+                layer_name,
                 layer_metadata.generation.get_suffix()
             );
 
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
     async fn download_layers(
         &self,
         ttid: TenantShardTimelineId,
-        layers: Vec<(LayerFileName, IndexLayerMetadata)>,
+        layers: Vec<(LayerName, IndexLayerMetadata)>,
     ) -> anyhow::Result<()> {
         let layer_count = layers.len();
         tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
@@ -138,7 +138,7 @@ impl SnapshotDownloader {
                     tracing::info!(
                         "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}",
                         layer_metadata.file_size,
-                        layer_name.file_name()
+                        layer_name
                     );
                 }
                 Err(e) => {
@@ -163,7 +163,7 @@ impl SnapshotDownloader {
         index_part_generation: Generation,
         ancestor_layers: &mut HashMap<
             TenantShardTimelineId,
-            HashMap<LayerFileName, IndexLayerMetadata>,
+            HashMap<LayerName, IndexLayerMetadata>,
         >,
     ) -> anyhow::Result<()> {
         let index_bytes = serde_json::to_string(&index_part).unwrap();
@@ -234,7 +234,7 @@ impl SnapshotDownloader {
         // happen if this tenant has been split at some point)
         let mut ancestor_layers: HashMap<
             TenantShardTimelineId,
-            HashMap<LayerFileName, IndexLayerMetadata>,
+            HashMap<LayerName, IndexLayerMetadata>,
         > = Default::default();
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f618c508bc..390b94c2ea 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import (
     DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name
+from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
@@ -2664,7 +2664,7 @@ class NeonPageserver(PgProtocol, LogUtils):
         )
 
     def layer_exists(
-        self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName
+        self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName
     ) -> bool:
         layers = self.list_layers(tenant_id, timeline_id)
         return layer_name in [parse_layer_file_name(p.name) for p in layers]
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py
index fd018cb778..1fb618f445 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/types.py
@@ -12,7 +12,7 @@ class IndexLayerMetadata:
 
 
 @dataclass(frozen=True)
-class ImageLayerFileName:
+class ImageLayerName:
     lsn: Lsn
     key_start: Key
     key_end: Key
@@ -26,7 +26,7 @@ class ImageLayerFileName:
 
 
 @dataclass(frozen=True)
-class DeltaLayerFileName:
+class DeltaLayerName:
     lsn_start: Lsn
     lsn_end: Lsn
     key_start: Key
@@ -41,14 +41,16 @@ class DeltaLayerFileName:
         return ret
 
 
-LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName]
+LayerName = Union[ImageLayerName, DeltaLayerName]
 
 
 class InvalidFileName(Exception):
     pass
 
 
-IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$")
+IMAGE_LAYER_FILE_NAME = re.compile(
+    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$"
+)
 
 
 def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
@@ -62,7 +64,7 @@ def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
 
 
 DELTA_LAYER_FILE_NAME = re.compile(
-    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$"
+    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$"
 )
 
 
@@ -80,16 +82,16 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
     )
 
 
-def parse_layer_file_name(file_name: str) -> LayerFileName:
+def parse_layer_file_name(file_name: str) -> LayerName:
     try:
         key_start, key_end, lsn = parse_image_layer(file_name)
-        return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
+        return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
     except InvalidFileName:
         pass
 
     try:
         key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name)
-        return DeltaLayerFileName(
+        return DeltaLayerName(
             lsn_start=Lsn(lsn_start),
             lsn_end=Lsn(lsn_end),
             key_start=Key(key_start),
@@ -101,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName:
     raise InvalidFileName("neither image nor delta layer")
 
 
-def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
+def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn):
     """
     Determines if this layer file is considered to be in future meaning we will discard these
     layers during timeline initialization from the given disk_consistent_lsn.
     """
-    if (
-        isinstance(layer_file_name, ImageLayerFileName)
-        and layer_file_name.lsn > disk_consistent_lsn
-    ):
+    if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn:
         return True
     elif (
-        isinstance(layer_file_name, DeltaLayerFileName)
+        isinstance(layer_file_name, DeltaLayerName)
         and layer_file_name.lsn_end > disk_consistent_lsn + 1
     ):
         return True
@@ -122,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
 
 @dataclass
 class IndexPartDump:
-    layer_metadata: Dict[LayerFileName, IndexLayerMetadata]
+    layer_metadata: Dict[LayerName, IndexLayerMetadata]
     disk_consistent_lsn: Lsn
 
     @classmethod
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index f311a8bf2c..cc34fd83e9 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -3,8 +3,8 @@ import time
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 from fixtures.pageserver.types import (
-    DeltaLayerFileName,
-    ImageLayerFileName,
+    DeltaLayerName,
+    ImageLayerName,
     is_future_layer,
 )
 from fixtures.pageserver.utils import (
@@ -81,7 +81,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     current = get_index_part()
     assert len(set(current.layer_metadata.keys())) == 1
     layer_file_name = list(current.layer_metadata.keys())[0]
-    assert isinstance(layer_file_name, DeltaLayerFileName)
+    assert isinstance(layer_file_name, DeltaLayerName)
     assert layer_file_name.is_l0(), f"{layer_file_name}"
 
     log.info("force image layer creation in the future by writing some data into in-memory layer")
@@ -146,7 +146,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     future_layers = get_future_layers()
     assert len(future_layers) == 1
     future_layer = future_layers[0]
-    assert isinstance(future_layer, ImageLayerFileName)
+    assert isinstance(future_layer, ImageLayerName)
     assert future_layer.lsn == last_record_lsn
     log.info(
         f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index adcf7de8d4..58eaf404d3 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -729,8 +729,8 @@ def test_upgrade_generationless_local_file_paths(
     for filename in os.listdir(timeline_dir):
         path = os.path.join(timeline_dir, filename)
         log.info(f"Found file {path}")
-        if path.endswith("-00000001"):
-            new_path = path[:-9]
+        if path.endswith("-v1-00000001"):
+            new_path = path[:-12]
             os.rename(path, new_path)
             log.info(f"Renamed {path} -> {new_path}")
             files_renamed += 1

From b06eec41fa5971899fd15ed4b643889863c616c7 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 8 May 2024 20:58:35 +0300
Subject: [PATCH 009/130] Ignore page header when comparing VM pages in
 test_vm_bits.py (#7499)

## Problem

See #6714, #6967

## Summary of changes

Completely ignore page header when comparing VM pages.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 test_runner/regress/test_vm_bits.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 06f2a8befd..b549db1af6 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -168,15 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
     # The VM page in shared buffer cache, and the same page as reconstructed
     # by the pageserver, should be equal.
     #
-    # Ignore the LSN on the page though (first 8 bytes). If the dirty
-    # VM page is flushed from the cache for some reason, it gets WAL-logged,
-    # which changes the LSN on the page.
+    # Ignore page header (24 bytes) of visibility map.
+    # If the dirty VM page is flushed from the cache for some reason,
+    # it gets WAL-logged, which changes the LSN on the page.
+    # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page.
     cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-    vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
+    vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex()
     cur.execute(
         "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
     )
-    vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
+    vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex()
 
     assert vm_page_at_pageserver == vm_page_in_cache
 

From d5399b729b3ecd3d9d38d8e61d3511fc4bf321b5 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 8 May 2024 19:29:16 +0100
Subject: [PATCH 010/130] pageserver: fix division by zero in layer counting
 metric (#7662)

For aux file keys (v1 or v2) the vectored read path does not return an
error when they're missing. Instead they are omitted from the resulting
btree (this is a requirement, not a bug). Skip updating the metric in
these cases to avoid infinite results.
---
 pageserver/src/tenant/timeline.rs | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7edb922069..5983529a44 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1220,11 +1220,17 @@ impl Timeline {
         }
         reconstruct_timer.stop_and_record();
 
-        // Note that this is an approximation. Tracking the exact number of layers visited
-        // per key requires virtually unbounded memory usage and is inefficient
-        // (i.e. segment tree tracking each range queried from a layer)
-        crate::metrics::VEC_READ_NUM_LAYERS_VISITED
-            .observe(layers_visited as f64 / results.len() as f64);
+        // For aux file keys (v1 or v2) the vectored read path does not return an error
+        // when they're missing. Instead they are omitted from the resulting btree
+        // (this is a requirement, not a bug). Skip updating the metric in these cases
+        // to avoid infinite results.
+        if !results.is_empty() {
+            // Note that this is an approximation. Tracking the exact number of layers visited
+            // per key requires virtually unbounded memory usage and is inefficient
+            // (i.e. segment tree tracking each range queried from a layer)
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED
+                .observe(layers_visited as f64 / results.len() as f64);
+        }
 
         Ok(results)
     }

From ab10523cc1d59cd65d88181645b149d4adc23c5e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 9 May 2024 10:58:38 +0200
Subject: [PATCH 011/130] remote_storage: AWS_PROFILE with endpoint overrides
 in ~/.aws/config (updates AWS SDKs) (#7664)

Before this PR, using the AWS SDK profile feature for running against
minio didn't work because
* our SDK versions were too old and didn't include
  https://github.com/awslabs/aws-sdk-rust/issues/1060 and
* we didn't massage the s3 client config builder correctly.

This PR
* udpates all the AWS SDKs we use to, respectively, the latest version I
could find on crates.io (Is there a better process?)
* changes the way remote_storage constructs the S3 client, and
* documents how to run the test suite against real S3 & local minio.

Regarding the changes to `remote_storage`: if one reads the SDK docs, it
is clear that the recommended way is to use `aws_config::from_env`, then
customize.
What we were doing instead is to use the `aws_sdk_s3` builder directly.

To get the `local-minio` in the added docs working, I needed to update
both the SDKs and make the changes to the `remote_storage`. See the
commit history in this PR for details.

Refs:
* byproduct: https://github.com/smithy-lang/smithy-rs/pull/3633
* follow-up on deprecation:
https://github.com/neondatabase/neon/issues/7665
* follow-up for scrubber S3 setup:
https://github.com/neondatabase/neon/issues/7667
---
 Cargo.lock                           |  88 +++++++++------
 Cargo.toml                           |  14 +--
 libs/remote_storage/src/s3_bucket.rs |  92 +++++++++------
 s3_scrubber/src/lib.rs               |   5 +-
 test_runner/README.md                | 160 +++++++++++++++++++++++++++
 workspace_hack/Cargo.toml            |   2 +-
 6 files changed, 285 insertions(+), 76 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9bff5e1eff..6ce7180d67 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
 [[package]]
 name = "ahash"
-version = "0.8.9"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
+checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
 dependencies = [
  "cfg-if",
  "const-random",
@@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "aws-config"
-version = "1.1.4"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82"
+checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -309,14 +309,15 @@ dependencies = [
  "time",
  "tokio",
  "tracing",
+ "url",
  "zeroize",
 ]
 
 [[package]]
 name = "aws-credential-types"
-version = "1.1.8"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
+checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-runtime-api",
@@ -326,9 +327,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.1.8"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
+checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -373,10 +374,11 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "1.14.0"
+version = "1.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113"
+checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d"
 dependencies = [
+ "ahash",
  "aws-credential-types",
  "aws-runtime",
  "aws-sigv4",
@@ -391,20 +393,25 @@ dependencies = [
  "aws-smithy-xml",
  "aws-types",
  "bytes",
+ "fastrand 2.0.0",
+ "hex",
+ "hmac",
  "http 0.2.9",
  "http-body 0.4.5",
+ "lru",
  "once_cell",
  "percent-encoding",
  "regex-lite",
+ "sha2",
  "tracing",
  "url",
 ]
 
 [[package]]
 name = "aws-sdk-sso"
-version = "1.12.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b"
+checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -424,9 +431,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.12.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841"
+checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -446,9 +453,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.12.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce"
+checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -469,9 +476,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sigv4"
-version = "1.2.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
+checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-eventstream",
@@ -498,9 +505,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-async"
-version = "1.1.8"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
+checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -509,9 +516,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b"
+checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -541,9 +548,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http"
-version = "0.60.7"
+version = "0.60.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
+checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-runtime-api",
@@ -581,9 +588,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.1.8"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
+checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -594,6 +601,7 @@ dependencies = [
  "h2 0.3.26",
  "http 0.2.9",
  "http-body 0.4.5",
+ "http-body 1.0.0",
  "hyper 0.14.26",
  "hyper-rustls 0.24.0",
  "once_cell",
@@ -606,9 +614,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.2.0"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
+checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
@@ -623,16 +631,19 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.1.8"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
+checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1"
 dependencies = [
  "base64-simd",
  "bytes",
  "bytes-utils",
  "futures-core",
  "http 0.2.9",
+ "http 1.1.0",
  "http-body 0.4.5",
+ "http-body 1.0.0",
+ "http-body-util",
  "itoa",
  "num-integer",
  "pin-project-lite",
@@ -646,18 +657,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.7"
+version = "0.60.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
+checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55"
 dependencies = [
  "xmlparser",
 ]
 
 [[package]]
 name = "aws-types"
-version = "1.1.8"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
+checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-async",
@@ -2935,6 +2946,15 @@ version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 
+[[package]]
+name = "lru"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
+dependencies = [
+ "hashbrown 0.14.0",
+]
+
 [[package]]
 name = "match_cfg"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 1ddadd2f3c..17f30a1327 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -52,14 +52,14 @@ azure_storage_blobs = "0.19"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "1.14"
+aws-config = { version = "1.3", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.26"
 aws-sdk-iam = "1.15.0"
-aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.1.4"
-aws-credential-types = "1.1.4"
-aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
-aws-types = "1.1.7"
+aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.1.9"
+aws-credential-types = "1.2.0"
+aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
+aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index c0b89cee2a..c3d6c75e20 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -27,7 +27,7 @@ use aws_config::{
 };
 use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
+    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
     error::SdkError,
     operation::get_object::GetObjectError,
     types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
@@ -75,13 +75,13 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
     /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
         tracing::debug!(
             "Creating s3 remote storage for S3 bucket {}",
-            aws_config.bucket_name
+            remote_storage_config.bucket_name
         );
 
-        let region = Some(Region::new(aws_config.bucket_region.clone()));
+        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
 
         let provider_conf = ProviderConfig::without_region().with_region(region.clone());
 
@@ -113,6 +113,38 @@ impl S3Bucket {
         // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
         let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
 
+        let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
+            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
+            BehaviorVersion::v2023_11_09(),
+        )
+        .region(region)
+        .identity_cache(IdentityCache::lazy().build())
+        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+        .sleep_impl(SharedAsyncSleep::from(sleep_impl));
+
+        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
+            s.spawn(|| {
+                // TODO: make this function async.
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()
+                    .unwrap()
+                    .block_on(sdk_config_loader.load())
+            })
+            .join()
+            .unwrap()
+        });
+
+        let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
+
+        // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
+        // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
+        if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
+            s3_config_builder = s3_config_builder
+                .endpoint_url(custom_endpoint)
+                .force_path_style(true);
+        }
+
         // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
         // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
         // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -120,42 +152,36 @@ impl S3Bucket {
         retry_config
             .set_max_attempts(Some(1))
             .set_mode(Some(RetryMode::Adaptive));
+        s3_config_builder = s3_config_builder.retry_config(retry_config.build());
 
-        let mut config_builder = Builder::default()
-            .behavior_version(BehaviorVersion::v2023_11_09())
-            .region(region)
-            .identity_cache(IdentityCache::lazy().build())
-            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
-            .retry_config(retry_config.build())
-            .sleep_impl(SharedAsyncSleep::from(sleep_impl));
+        let s3_config = s3_config_builder.build();
+        let client = aws_sdk_s3::Client::from_conf(s3_config);
 
-        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
-            config_builder = config_builder
-                .endpoint_url(custom_endpoint)
-                .force_path_style(true);
-        }
+        let prefix_in_bucket = remote_storage_config
+            .prefix_in_bucket
+            .as_deref()
+            .map(|prefix| {
+                let mut prefix = prefix;
+                while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    prefix = &prefix[1..]
+                }
 
-        let client = Client::from_conf(config_builder.build());
+                let mut prefix = prefix.to_string();
+                while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    prefix.pop();
+                }
+                prefix
+            });
 
-        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
-            let mut prefix = prefix;
-            while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                prefix = &prefix[1..]
-            }
-
-            let mut prefix = prefix.to_string();
-            while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                prefix.pop();
-            }
-            prefix
-        });
         Ok(Self {
             client,
-            bucket_name: aws_config.bucket_name.clone(),
-            max_keys_per_list_response: aws_config.max_keys_per_list_response,
+            bucket_name: remote_storage_config.bucket_name.clone(),
+            max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
-            upload_storage_class: aws_config.upload_storage_class.clone(),
+            concurrency_limiter: ConcurrencyLimiter::new(
+                remote_storage_config.concurrency_limit.get(),
+            ),
+            upload_storage_class: remote_storage_config.upload_storage_class.clone(),
             timeout,
         })
     }
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index e976e66748..7966fb6a88 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -312,7 +312,10 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
     let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
 
     let mut builder = Config::builder()
-        .behavior_version(BehaviorVersion::v2023_11_09())
+        .behavior_version(
+            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
+            BehaviorVersion::v2023_11_09(),
+        )
         .region(bucket_region)
         .retry_config(RetryConfig::adaptive().with_max_attempts(3))
         .sleep_impl(SharedAsyncSleep::from(sleep_impl))
diff --git a/test_runner/README.md b/test_runner/README.md
index 051897744a..fd68cfff79 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -92,6 +92,166 @@ Exit after the first test failure:
 `./scripts/pytest -x ...`
 (there are many more pytest options; run `pytest -h` to see them.)
 
+#### Running Python tests against real S3 or S3-compatible services
+
+Neon's `libs/remote_storage` supports multiple implementations of remote storage.
+At the time of writing, that is
+```rust
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs(Utf8PathBuf),
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
+}
+```
+
+The test suite has a Python enum with equal name but different meaning:
+
+```python
+@enum.unique
+class RemoteStorageKind(str, enum.Enum):
+    LOCAL_FS = "local_fs"
+    MOCK_S3 = "mock_s3"
+    REAL_S3 = "real_s3"
+```
+
+* `LOCAL_FS` => `LocalFs`
+* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3`
+* `REAL_S3` => configure `AwsS3` as detailed below
+
+When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`.
+That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`:
+* If it is not set, use `MOCK_S3`
+* If it is set, use `REAL_S3`.
+
+For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars:
+
+```rust
+pub struct S3Config {
+    // test suite env var: REMOTE_STORAGE_S3_BUCKET
+    pub bucket_name: String,
+    // test suite env var: REMOTE_STORAGE_S3_REGION
+    pub bucket_region: String,
+    // test suite determines this
+    pub prefix_in_bucket: Option<String>,
+    // no env var exists; test suite sets it for MOCK_S3, because that's how moto works
+    pub endpoint: Option<String>,
+    ...
+}
+```
+
+*Credentials* are not part of the config, but discovered by the AWS SDK.
+See the `libs/remote_storage` Rust code.
+We're documenting two mechanism here:
+
+The test suite supports two mechanisms (`remote_storage.py`):
+
+**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
+Populate the env vars with AWS access keys that you created in IAM.
+Our CI uses this mechanism.
+However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)).
+Instead, use profiles (next section).
+
+**Credential mechanism 2**: env var `AWS_PROFILE`.
+This uses the AWS SDK's (and CLI's) profile mechanism.
+Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html).
+After configuring a profile (e.g. via the aws CLI), set the env var to its name.
+
+In conclusion, the full command line is:
+
+```bash
+# with long-term AWS access keys
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=eu-central-1 \
+AWS_ACCESS_KEY_ID=... \
+AWS_SECRET_ACCESS_KEY=... \
+./scripts/pytest
+```
+<!-- Don't forget to update the Minio example when changing these -->
+```bash
+# with AWS PROFILE
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=eu-central-1 \
+AWS_PROFILE=... \
+./scripts/pytest
+```
+
+If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first.
+
+##### Minio
+
+If you want to run test without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html).
+
+```bash
+# Start in Terminal 1
+mkdir /tmp/minio_data
+minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000
+```
+
+In another terminal, create an `aws` CLI profile for it:
+
+```ini
+# append to ~/.aws/config
+[profile local-minio]
+services = local-minio-services
+[services local-minio-services]
+s3 =
+  endpoint_url=http://127.0.0.1:9000/
+```
+
+
+Now configure the credentials (this is going to write `~/.aws/credentials` for you).
+It's an interactive prompt.
+
+```bash
+# Terminal 2
+$ aws --profile local-minio configure
+AWS Access Key ID [None]: minioadmin
+AWS Secret Access Key [None]: minioadmin
+Default region name [None]:
+Default output format [None]:
+```
+
+Now create a bucket `testbucket` using the CLI.
+
+```bash
+# (don't forget to have AWS_PROFILE env var set; or use --profile)
+aws --profile local-minio s3 mb s3://mybucket
+```
+
+(If it doesn't work, make sure you update your AWS CLI to a recent version.
+ The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html)
+ that we're using is quite new.)
+
+```bash
+# with AWS PROFILE
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=doesntmatterforminio \
+AWS_PROFILE=local-minio \
+./scripts/pytest
+```
+
+NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable.
+Just like the AWS SDKs, the `aws` CLI is sensible to it.
+
+#### Running Rust tests against real S3 or S3-compatible services
+
+We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397).
+
+They use the same env vars as the Python test suite (see previous section)
+but interpret them on their own.
+However, at this time, the interpretation is identical.
+
+So, above instructions apply to the Rust test as well.
+
 ### Writing a test
 
 Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index b2da33e44a..b605757f64 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -19,7 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre
 aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] }
 aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] }
 aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] }
-aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] }
+aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] }
 axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }

From 39c712f2ca216a1d1556d4c0f8a846919418e661 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 9 May 2024 10:07:59 +0100
Subject: [PATCH 012/130] tests: adjust log allow list since reqwest upgrade
 (#7666)

## Problem

Various performance test cases were destabilized by the recent upgrade
of `reqwest`, because it changes an error string.

Examples:
-
https://neon-github-public-dev.s3.amazonaws.com/reports/main/9005532594/index.html#testresult/3f984e471a9029a5/
-
https://neon-github-public-dev.s3.amazonaws.com/reports/main/9005532594/index.html#testresult/8bd0f095fe0402b7/

The performance tests suffer from this more than most tests, because
they churn enough data that the pageserver is still trying to contact
the storage controller while it is shut down at the end of tests.

## Summary of changes

s/Connection refused/error sending request/
---
 test_runner/fixtures/pageserver/allowed_errors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index e560844944..58a76d7586 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -88,7 +88,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*Flushed oversized open layer with size.*",
     # During teardown, we stop the storage controller before the pageservers, so pageservers
     # can experience connection errors doing background deletion queue work.
-    ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*",
+    ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
     # Can happen when the test shuts down the storage controller while it is calling the utilization API
     ".*WARN.*path=/v1/utilization .*request was dropped before completing",
 )

From 107f53529409533fec5e1ca39abf9acde8161862 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 9 May 2024 12:33:09 +0100
Subject: [PATCH 013/130] storage controller: fix handing of tenants with no
 timelines during scheduling optimization (#7673)

## Problem

Storage controller was using a zero layer count in SecondaryProgress as
a proxy for "not initialized". However, in tenants with zero timelines
(a legitimate state), the layer count remains zero forever.

This caused https://github.com/neondatabase/neon/pull/7583 to
destabilize the storage controller scale test, which creates lots of
tenants, some of which don't get any timelines.

## Summary of changes

- Use a None mtime instead of zero layer count to determine if a
SecondaryProgress should be ignored.
- Adjust the test to use a shorter heatmap upload period to let it
proceed faster while waiting for scheduling optimizations to complete.
---
 storage_controller/src/service.rs                        | 2 +-
 test_runner/performance/test_storage_controller_scale.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d3a53066c9..ae7e8d3d7d 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4745,7 +4745,7 @@ impl Service {
                     // them in an optimization
                     const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024;
 
-                    if progress.bytes_total == 0
+                    if progress.heatmap_mtime.is_none()
                         || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD
                             && progress.bytes_downloaded != progress.bytes_total
                         || progress.bytes_total - progress.bytes_downloaded
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 17dc96dabe..632d465c3f 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -102,6 +102,9 @@ def test_storage_controller_many_tenants(
                 tenant_id,
                 shard_count,
                 stripe_size,
+                # Upload heatmaps fast, so that secondary downloads happen promptly, enabling
+                # the controller's optimization migrations to proceed promptly.
+                tenant_config={"heatmap_period": "10s"},
                 placement_policy={"Attached": 1},
             )
             futs.append(f)

From 41fb838799ca2b0e3c20c440d49151b7153d9ff8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 9 May 2024 16:01:16 +0200
Subject: [PATCH 014/130] Fix tiered compaction k-merge bug and use in-memory
 alternative (#7661)

This PR does two things:

First, it fixes a bug with tiered compaction's k-merge implementation.
It ignored the lsn of a key during ordering, so multiple updates of the
same key could be read in arbitrary order, say from different layers.
For example there is layers `[(a, 2),(b, 3)]` and `[(a, 1),(c, 2)]` in
the heap, they might return `(a,2)` and `(a,1)`.

Ultimately, this change wasn't enough to fix the ordering issues in
#7296, in other words there is likely still bugs in the k-merge. So as
the second thing, we switch away from the k-merge to an in-memory based
one, similar to #4839, but leave the code around to be improved and
maybe switched to later on.

Part of #7296
---
 pageserver/compaction/src/compact_tiered.rs |  9 +++++--
 pageserver/compaction/src/helpers.rs        | 30 ++++++++++++++++++---
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index 137b93055a..12882c9d59 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -24,7 +24,9 @@ use tracing::{debug, info};
 use std::collections::{HashSet, VecDeque};
 use std::ops::Range;
 
-use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
+use crate::helpers::{
+    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
+};
 use crate::interface::*;
 use utils::lsn::Lsn;
 
@@ -535,7 +537,10 @@ where
             }
         }
         // Open stream
-        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
+        let key_value_stream =
+            std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
+                .await?
+                .map(Result::<_, anyhow::Error>::Ok));
         let mut new_jobs = Vec::new();
 
         // Slide a window through the keyspace
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index eb0e5ee82a..06454ee1d0 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -14,6 +14,7 @@ use std::future::Future;
 use std::ops::{DerefMut, Range};
 use std::pin::Pin;
 use std::task::{ready, Poll};
+use utils::lsn::Lsn;
 
 pub fn keyspace_total_size<K>(
     keyspace: &CompactionKeySpace<K>,
@@ -109,17 +110,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
     }
 }
 
+pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
+    layers: &'a [E::DeltaLayer],
+    ctx: &'a E::RequestContext,
+) -> anyhow::Result<impl Stream<Item = <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>
+{
+    let mut keys = Vec::new();
+    for l in layers {
+        // Boxing and casting to LoadFuture is required to obtain the right Sync bound.
+        // If we do l.load_keys(ctx).await? directly, there is a compilation error.
+        let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx));
+        keys.extend(load_future.await?.into_iter());
+    }
+    keys.sort_by_key(|k| (k.key(), k.lsn()));
+    let stream = futures::stream::iter(keys.into_iter());
+    Ok(stream)
+}
+
 enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
     Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
     Unloaded(&'a E::DeltaLayer),
 }
 impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
-    fn key(&self) -> E::Key {
+    fn min_key(&self) -> E::Key {
         match self {
             Self::Loaded(entries) => entries.front().unwrap().key(),
             Self::Unloaded(dl) => dl.key_range().start,
         }
     }
+    fn min_lsn(&self) -> Lsn {
+        match self {
+            Self::Loaded(entries) => entries.front().unwrap().lsn(),
+            Self::Unloaded(dl) => dl.lsn_range().start,
+        }
+    }
 }
 impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
@@ -129,12 +153,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
 impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         // reverse order so that we get a min-heap
-        other.key().cmp(&self.key())
+        (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
     }
 }
 impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
     fn eq(&self, other: &Self) -> bool {
-        self.key().eq(&other.key())
+        self.cmp(other) == std::cmp::Ordering::Equal
     }
 }
 impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}

From 2682e0254ffb82f2e1eef0ec875346742b6e8b4e Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 9 May 2024 11:15:19 -0400
Subject: [PATCH 015/130] Revert "chore(neon_test_utils): restrict installation
 to superuser" (#7679)

This reverts commit 1173ee6a7e1168e671a6847eb94807b45c703490.

## Problem

It breaks autoscaling tests
---
 pgxn/neon_test_utils/neon_test_utils.control | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control
index 8c5b9b5dfe..5f6d640835 100644
--- a/pgxn/neon_test_utils/neon_test_utils.control
+++ b/pgxn/neon_test_utils/neon_test_utils.control
@@ -3,5 +3,4 @@ comment = 'helpers for neon testing and debugging'
 default_version = '1.1'
 module_pathname = '$libdir/neon_test_utils'
 relocatable = true
-trusted = false
-superuser = true
+trusted = true

From 5ea117cddfe3bc58c500f0eff8352af796b58268 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 9 May 2024 17:55:57 +0000
Subject: [PATCH 016/130] build(deps): bump Npgsql from 8.0.2 to 8.0.3 in
 /test_runner/pg_clients/csharp/npgsql (#7680)

---
 test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
index 50243e3ea7..edf2a01337 100644
--- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
+++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
@@ -8,7 +8,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Npgsql" Version="8.0.2" />
+    <PackageReference Include="Npgsql" Version="8.0.3" />
   </ItemGroup>
 
 </Project>

From b9fd8dcf13e13b804047fc21089d2ecb509a1548 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 9 May 2024 15:52:56 -0400
Subject: [PATCH 017/130] fix(test): update the config for neon_binpath in
 from_repo_dir (#7684)

## Problem

https://github.com/neondatabase/neon/pull/7637 breaks forward compat
test.

On commit ea531d448eb65c4f58abb9ef7d8cd461952f7c5f.


https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988324349/index.html

```
test_create_snapshot
2024-05-07T16:03:11.331883Z  INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:03:11.316131763 UTC build_tag: build_tag-env:5159

test_forward_compatibility
2024-05-07T16:07:02.310769Z  INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:07:02.294676183 UTC build_tag: build_tag-env:5159
```

The forward compatibility test is actually using the same tag as the
current build.

The commit before that,


https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988126011/index.html

```
test_create_snapshot
2024-05-07T15:47:21.900796Z  INFO version: git-env:2dbd1c1ed5cd0458933e8ffd40a9c0a5f4d610b8 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:47:21.882784185 UTC build_tag: build_tag-env:5158

test_forward_compatibility
2024-05-07T15:50:48.828733Z  INFO version: git-env:c4d7d5982553d2cf66634d1fbf85d95ef44a6524 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:50:48.816635176 UTC build_tag: build_tag-env:release-5434
```

This pull request patches the bin path so that the new neon_local will
use the old binary.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 390b94c2ea..da379693a0 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -701,6 +701,11 @@ class NeonEnvBuilder:
         config["default_tenant_id"] = snapshot_config["default_tenant_id"]
         config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
 
+        # Update the config with new neon + postgres path in case of compat test
+        # FIXME: overriding pg_distrib_dir cause storage controller fail to start
+        # config["pg_distrib_dir"] = str(self.pg_distrib_dir)
+        config["neon_distrib_dir"] = str(self.neon_binpath)
+
         with (self.repo_dir / "config").open("w") as f:
             toml.dump(config, f)
 

From be1a88e574379ef29005e5a8760105509046584a Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Fri, 10 May 2024 12:17:00 +0200
Subject: [PATCH 018/130] Proxy added per ep rate limiter (#7636)

## Problem

There is no global per-ep rate limiter in proxy.

## Summary of changes

* Return global per-ep rate limiter back.
* Rename weak compute rate limiter (the cli flags were not used
anywhere, so it's safe to rename).
---
 proxy/src/auth/backend.rs          | 73 ++++++++++++++++++++++++------
 proxy/src/bin/proxy.rs             | 26 ++++++++---
 proxy/src/console/provider/neon.rs |  8 ++--
 proxy/src/proxy.rs                 |  7 ++-
 proxy/src/rate_limiter/limiter.rs  | 17 ++++---
 proxy/src/serverless.rs            | 19 ++++++--
 proxy/src/serverless/backend.rs    |  8 ++++
 proxy/src/serverless/websocket.rs  |  3 ++
 8 files changed, 126 insertions(+), 35 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 3795e3b608..6a906b299b 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys;
 use tracing::{info, warn};
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
-use crate::auth::validate_password_and_exchange;
+use crate::auth::{validate_password_and_exchange, AuthError};
 use crate::cache::Cached;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
@@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt;
 use crate::metrics::Metrics;
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
-use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
+use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
 use crate::stream::Stream;
 use crate::{
     auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -280,6 +280,7 @@ async fn auth_quirks(
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> auth::Result<ComputeCredentials> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
@@ -305,6 +306,10 @@ async fn auth_quirks(
     if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
     }
+
+    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
+        return Err(AuthError::too_many_connections());
+    }
     let cached_secret = match maybe_secret {
         Some(secret) => secret,
         None => api.get_role_secret(ctx, &info).await?,
@@ -417,6 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
+        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
         use BackendType::*;
 
@@ -428,8 +434,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                     "performing authentication using the console"
                 );
 
-                let credentials =
-                    auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
+                let credentials = auth_quirks(
+                    ctx,
+                    &*api,
+                    user_info,
+                    client,
+                    allow_cleartext,
+                    config,
+                    endpoint_rate_limiter,
+                )
+                .await?;
                 BackendType::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
@@ -539,7 +553,7 @@ mod tests {
         },
         context::RequestMonitoring,
         proxy::NeonOptions,
-        rate_limiter::RateBucketInfo,
+        rate_limiter::{EndpointRateLimiter, RateBucketInfo},
         scram::ServerSecret,
         stream::{PqStream, Stream},
     };
@@ -699,10 +713,20 @@ mod tests {
                 _ => panic!("wrong message"),
             }
         });
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
 
-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
-            .await
-            .unwrap();
+        let _creds = auth_quirks(
+            &mut ctx,
+            &api,
+            user_info,
+            &mut stream,
+            false,
+            &CONFIG,
+            endpoint_rate_limiter,
+        )
+        .await
+        .unwrap();
 
         handle.await.unwrap();
     }
@@ -739,10 +763,20 @@ mod tests {
             frontend::password_message(b"my-secret-password", &mut write).unwrap();
             client.write_all(&write).await.unwrap();
         });
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
 
-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
-            .await
-            .unwrap();
+        let _creds = auth_quirks(
+            &mut ctx,
+            &api,
+            user_info,
+            &mut stream,
+            true,
+            &CONFIG,
+            endpoint_rate_limiter,
+        )
+        .await
+        .unwrap();
 
         handle.await.unwrap();
     }
@@ -780,9 +814,20 @@ mod tests {
             client.write_all(&write).await.unwrap();
         });
 
-        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
-            .await
-            .unwrap();
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
+
+        let creds = auth_quirks(
+            &mut ctx,
+            &api,
+            user_info,
+            &mut stream,
+            true,
+            &CONFIG,
+            endpoint_rate_limiter,
+        )
+        .await
+        .unwrap();
 
         assert_eq!(creds.info.endpoint, "my-endpoint");
 
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 5399f13edd..be7d961b8c 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -144,6 +144,9 @@ struct ProxyCliArgs {
     /// Can be given multiple times for different bucket sizes.
     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
     endpoint_rps_limit: Vec<RateBucketInfo>,
+    /// Wake compute rate limiter max number of requests per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    wake_compute_limit: Vec<RateBucketInfo>,
     /// Whether the auth rate limiter actually takes effect (for testing)
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     auth_rate_limit_enabled: bool,
@@ -154,7 +157,7 @@ struct ProxyCliArgs {
     #[clap(long, default_value_t = 64)]
     auth_rate_limit_ip_subnet: u8,
     /// Redis rate limiter max number of requests per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
     redis_rps_limit: Vec<RateBucketInfo>,
     /// cache for `allowed_ips` (use `size=0` to disable)
     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
@@ -365,6 +368,10 @@ async fn main() -> anyhow::Result<()> {
         proxy::metrics::CancellationSource::FromClient,
     ));
 
+    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
+    RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
+
     // client facing tasks. these will exit on error or on cancellation
     // cancellation returns Ok(())
     let mut client_tasks = JoinSet::new();
@@ -373,6 +380,7 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
         cancellation_handler.clone(),
+        endpoint_rate_limiter.clone(),
     ));
 
     // TODO: rename the argument to something like serverless.
@@ -387,6 +395,7 @@ async fn main() -> anyhow::Result<()> {
             serverless_listener,
             cancellation_token.clone(),
             cancellation_handler.clone(),
+            endpoint_rate_limiter.clone(),
         ));
     }
 
@@ -559,11 +568,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             let url = args.auth_endpoint.parse()?;
             let endpoint = http::Endpoint::new(url, http::new_client());
 
-            let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
-            RateBucketInfo::validate(&mut endpoint_rps_limit)?;
-            let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
-            let api =
-                console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
+            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
+            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
+            let wake_compute_endpoint_rate_limiter =
+                Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit));
+            let api = console::provider::neon::Api::new(
+                endpoint,
+                caches,
+                locks,
+                wake_compute_endpoint_rate_limiter,
+            );
             let api = console::provider::ConsoleBackend::Console(api);
             auth::BackendType::Console(MaybeOwned::Owned(api), ())
         }
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index ec66641d01..7728d2cafa 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -26,7 +26,7 @@ pub struct Api {
     endpoint: http::Endpoint,
     pub caches: &'static ApiCaches,
     pub locks: &'static ApiLocks<EndpointCacheKey>,
-    pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    pub wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     jwt: String,
 }
 
@@ -36,7 +36,7 @@ impl Api {
         endpoint: http::Endpoint,
         caches: &'static ApiCaches,
         locks: &'static ApiLocks<EndpointCacheKey>,
-        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+        wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> Self {
         let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
             Ok(v) => v,
@@ -46,7 +46,7 @@ impl Api {
             endpoint,
             caches,
             locks,
-            endpoint_rate_limiter,
+            wake_compute_endpoint_rate_limiter,
             jwt,
         }
     }
@@ -283,7 +283,7 @@ impl super::Api for Api {
 
         // check rate limit
         if !self
-            .endpoint_rate_limiter
+            .wake_compute_endpoint_rate_limiter
             .check(user_info.endpoint.normalize().into(), 1)
         {
             return Err(WakeComputeError::TooManyConnections);
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index e4e095d77d..5824b70df9 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -19,6 +19,7 @@ use crate::{
     metrics::{Metrics, NumClientConnectionsGuard},
     protocol2::read_proxy_protocol,
     proxy::handshake::{handshake, HandshakeData},
+    rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     EndpointCacheKey,
 };
@@ -61,6 +62,7 @@ pub async fn task_main(
     listener: tokio::net::TcpListener,
     cancellation_token: CancellationToken,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("proxy has shut down");
@@ -86,6 +88,7 @@ pub async fn task_main(
         let cancellation_handler = Arc::clone(&cancellation_handler);
 
         tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
+        let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
 
         connections.spawn(async move {
             let (socket, peer_addr) = match read_proxy_protocol(socket).await{
@@ -123,6 +126,7 @@ pub async fn task_main(
                 cancellation_handler,
                 socket,
                 ClientMode::Tcp,
+                endpoint_rate_limiter2,
                 conn_gauge,
             )
             .instrument(span.clone())
@@ -234,6 +238,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     mode: ClientMode,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
     info!(
@@ -243,7 +248,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     let metrics = &Metrics::get().proxy;
     let proto = ctx.protocol;
-    // let _client_gauge = metrics.client_connections.guard(proto);
     let _request_gauge = metrics.connection_requests.guard(proto);
 
     let tls = config.tls_config.as_ref();
@@ -286,6 +290,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
             &mut stream,
             mode.allow_cleartext(),
             &config.authentication_config,
+            endpoint_rate_limiter,
         )
         .await
     {
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 5ba2c36436..b8c9490696 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo {
 }
 
 impl RateBucketInfo {
-    pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
+    pub const DEFAULT_SET: [Self; 3] = [
         Self::new(300, Duration::from_secs(1)),
         Self::new(200, Duration::from_secs(60)),
         Self::new(100, Duration::from_secs(600)),
     ];
 
+    pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
+        Self::new(500, Duration::from_secs(1)),
+        Self::new(300, Duration::from_secs(60)),
+        Self::new(200, Duration::from_secs(600)),
+    ];
+
     pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
         info.sort_unstable_by_key(|info| info.interval);
         let invalid = info
@@ -266,7 +272,7 @@ mod tests {
 
     #[test]
     fn default_rate_buckets() {
-        let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
+        let mut defaults = RateBucketInfo::DEFAULT_SET;
         RateBucketInfo::validate(&mut defaults[..]).unwrap();
     }
 
@@ -333,11 +339,8 @@ mod tests {
         let rand = rand::rngs::StdRng::from_seed([1; 32]);
         let hasher = BuildHasherDefault::<FxHasher>::default();
 
-        let limiter = BucketRateLimiter::new_with_rand_and_hasher(
-            &RateBucketInfo::DEFAULT_ENDPOINT_SET,
-            rand,
-            hasher,
-        );
+        let limiter =
+            BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher);
         for i in 0..1_000_000 {
             limiter.check(i, 1);
         }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index cbff51f207..f634ab4e98 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -36,6 +36,7 @@ use crate::context::RequestMonitoring;
 use crate::metrics::Metrics;
 use crate::protocol2::read_proxy_protocol;
 use crate::proxy::run_until_cancelled;
+use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
 
@@ -54,6 +55,7 @@ pub async fn task_main(
     ws_listener: TcpListener,
     cancellation_token: CancellationToken,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
@@ -82,6 +84,7 @@ pub async fn task_main(
     let backend = Arc::new(PoolingBackend {
         pool: Arc::clone(&conn_pool),
         config,
+        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
     });
 
     let tls_config = match config.tls_config.as_ref() {
@@ -129,6 +132,7 @@ pub async fn task_main(
             backend.clone(),
             connections.clone(),
             cancellation_handler.clone(),
+            endpoint_rate_limiter.clone(),
             conn_token.clone(),
             server.clone(),
             tls_acceptor.clone(),
@@ -162,6 +166,7 @@ async fn connection_handler(
     backend: Arc<PoolingBackend>,
     connections: TaskTracker,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     cancellation_token: CancellationToken,
     server: Builder<TokioExecutor>,
     tls_acceptor: TlsAcceptor,
@@ -245,6 +250,7 @@ async fn connection_handler(
                     session_id,
                     peer_addr,
                     http_request_token,
+                    endpoint_rate_limiter.clone(),
                 )
                 .in_current_span()
                 .map_ok_or_else(api_error_into_response, |r| r),
@@ -285,6 +291,7 @@ async fn request_handler(
     peer_addr: IpAddr,
     // used to cancel in-flight HTTP requests. not used to cancel websockets
     http_cancellation_token: CancellationToken,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Full<Bytes>>, ApiError> {
     let host = request
         .headers()
@@ -310,9 +317,15 @@ async fn request_handler(
 
         ws_connections.spawn(
             async move {
-                if let Err(e) =
-                    websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host)
-                        .await
+                if let Err(e) = websocket::serve_websocket(
+                    config,
+                    ctx,
+                    websocket,
+                    cancellation_handler,
+                    endpoint_rate_limiter,
+                    host,
+                )
+                .await
                 {
                     error!("error in websocket connection: {e:#}");
                 }
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index ce58f575e2..6b79c12316 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -16,6 +16,7 @@ use crate::{
     context::RequestMonitoring,
     error::{ErrorKind, ReportableError, UserFacingError},
     proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
+    rate_limiter::EndpointRateLimiter,
     Host,
 };
 
@@ -24,6 +25,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
 pub struct PoolingBackend {
     pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
     pub config: &'static ProxyConfig,
+    pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 }
 
 impl PoolingBackend {
@@ -39,6 +41,12 @@ impl PoolingBackend {
         if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
             return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
         }
+        if !self
+            .endpoint_rate_limiter
+            .check(conn_info.user_info.endpoint.clone().into(), 1)
+        {
+            return Err(AuthError::too_many_connections());
+        }
         let cached_secret = match maybe_secret {
             Some(secret) => secret,
             None => backend.get_role_secret(ctx).await?,
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index b6cd85af73..649bec2c7c 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -5,6 +5,7 @@ use crate::{
     error::{io_error, ReportableError},
     metrics::Metrics,
     proxy::{handle_client, ClientMode},
+    rate_limiter::EndpointRateLimiter,
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream};
@@ -134,6 +135,7 @@ pub async fn serve_websocket(
     mut ctx: RequestMonitoring,
     websocket: HyperWebsocket,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     hostname: Option<String>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
@@ -148,6 +150,7 @@ pub async fn serve_websocket(
         cancellation_handler,
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
+        endpoint_rate_limiter,
         conn_gauge,
     )
     .await;

From 13d9589c35d444b444b6ed9ef4d8d7144ad232d0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 10 May 2024 12:01:39 +0100
Subject: [PATCH 019/130] pageserver: don't call get_vectored with empty
 keyspace (#7686)

## Problem

This caused a variation of the stats bug fixed by
https://github.com/neondatabase/neon/pull/7662. That PR also fixed this
case, but we still shouldn't make redundant get calls.

## Summary of changes

- Only call get in the create image layers loop at the end of a range if
some keys have been accumulated
---
 pageserver/src/tenant/timeline.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5983529a44..60b3873b71 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4230,7 +4230,7 @@ impl Timeline {
 
                     // Maybe flush `key_rest_accum`
                     if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
-                        || last_key_in_range
+                        || (last_key_in_range && key_request_accum.raw_size() > 0)
                     {
                         let results = self
                             .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)

From 873b2220808e0bc059edb631186ed19b0238394d Mon Sep 17 00:00:00 2001
From: Andrey Taranik <andrey@cicd.team>
Date: Fri, 10 May 2024 15:04:23 +0400
Subject: [PATCH 020/130] use own arm64 gha runners (#7373)

## Problem

Move from aws based arm64 runners to bare-metal based

## Summary of changes
Changes in GitHub action workflows where `runs-on: arm64` used. More
parallelism added, build time for `neon with extra platform builds`
workflow reduced from 45m to 25m
---
 .config/nextest.toml                          |  2 +-
 .github/actionlint.yml                        |  4 +--
 .github/workflows/build-build-tools-image.yml |  2 +-
 .github/workflows/neon_extra_builds.yml       | 33 ++++++++++++-------
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/.config/nextest.toml b/.config/nextest.toml
index a9398e4ab0..affdc16f31 100644
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +1,2 @@
 [profile.default]
-slow-timeout = { period = "20s", terminate-after = 3 }
+slow-timeout = { period = "60s", terminate-after = 3 }
diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index cb36e2eee6..942861ecd8 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,11 +1,9 @@
 self-hosted-runner:
   labels:
     - arm64
-    - dev
     - gen3
     - large
-    # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
-    - macos-14
+    - large-arm64
     - small
     - us-east-2
 config-variables:
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index c527cef1ac..bdf00bcaae 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -39,7 +39,7 @@ jobs:
       matrix:
         arch: [ x64, arm64 ]
 
-    runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     env:
       IMAGE_TAG: ${{ inputs.image-tag }}
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 5a2f9d6645..fdb03963fb 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -136,7 +136,7 @@ jobs:
   check-linux-arm-build:
     needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
-    runs-on: [ self-hosted, dev, arm64 ]
+    runs-on: [ self-hosted, large-arm64 ]
 
     env:
       # Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
 
       - name: Run cargo build
         run: |
-          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
 
       - name: Run cargo test
         env:
           NEXTEST_RETRIES: 3
         run: |
-          cargo nextest run $CARGO_FEATURES
+          cargo nextest run $CARGO_FEATURES -j$(nproc)
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
           export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_s3
+          cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_azure
+          cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
 
   check-codestyle-rust-arm:
     needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
-    runs-on: [ self-hosted, dev, arm64 ]
+    runs-on: [ self-hosted, large-arm64 ]
 
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,6 +269,11 @@ jobs:
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+
     steps:
       - name: Fix git ownership
         run: |
@@ -305,31 +310,35 @@ jobs:
             exit 1
           fi
           echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+
       - name: Run cargo clippy (debug)
+        if: matrix.build_type == 'debug'
         run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
       - name: Run cargo clippy (release)
+        if: matrix.build_type == 'release'
         run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
 
       - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
+        if: matrix.build_type == 'release'
+        run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
         env:
             RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
 
       # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
       - name: Check formatting
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
         run: cargo fmt --all -- --check
 
       # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
       - name: Check rust dependencies
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
         run: |
           cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
           cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
 
       # https://github.com/EmbarkStudios/cargo-deny
       - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
         run: cargo deny check
 
   gather-rust-build-stats:
@@ -338,7 +347,7 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
       github.ref_name == 'main'
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: [ self-hosted, large ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -369,7 +378,7 @@ jobs:
         run: make walproposer-lib -j$(nproc)
 
       - name: Produce the build stats
-        run: cargo build --all --release --timings
+        run: cargo build --all --release --timings -j$(nproc)
 
       - name: Upload the build stats
         id: upload-stats

From 0b02043ba4e8d477b77a1f01bef9809c1f433ab4 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Fri, 10 May 2024 13:32:42 +0100
Subject: [PATCH 021/130] Fix permissions for safekeeper failpoints (#7669)

We didn't check permission in `"/v1/failpoints"` endpoint, it means that
everyone with per-tenant token could modify the failpoints. This commit
fixes that.
---
 safekeeper/src/http/routes.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 9ce26e6c5d..30d0081a47 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -519,6 +519,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .get("/v1/status", |r| request_span(r, status_handler))
         .put("/v1/failpoints", |r| {
             request_span(r, move |r| async {
+                check_permission(&r, None)?;
                 let cancel = CancellationToken::new();
                 failpoints_handler(r, cancel).await
             })

From 86905c132205b5cbffb199fb002bc9c0484d4f43 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 10 May 2024 17:15:11 +0300
Subject: [PATCH 022/130] openapi: resolve the synthetic_size duplication
 (#7651)

We had accidentally left two endpoints for `tenant`: `/synthetic_size`
and `/size`. Size had the more extensive description but has returned
404 since renaming. Remove the `/size` in favor of the working one and
describe the `text/html` output.
---
 pageserver/src/http/openapi_spec.yml | 38 +++++-----------------------
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index c425f3e628..36c74ed140 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -420,25 +420,6 @@ paths:
           description: Tenant scheduled to load successfully
 
   /v1/tenant/{tenant_id}/synthetic_size:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    get:
-      description: |
-        Calculate tenant's synthetic size
-      responses:
-        "200":
-          description: Tenant's synthetic size
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/SyntheticSizeResponse"
-
-  # This route has no handler. TODO: remove?
-  /v1/tenant/{tenant_id}/size:
     parameters:
       - name: tenant_id
         in: path
@@ -468,19 +449,9 @@ paths:
           content:
             application/json:
               schema:
-                type: object
-                required:
-                  - id
-                  - size
-                properties:
-                  id:
-                    type: string
-                    format: hex
-                  size:
-                    type: integer
-                    nullable: true
-                    description: |
-                      Size metric in bytes or null if inputs_only=true was given.
+                $ref: "#/components/schemas/SyntheticSizeResponse"
+            text/html:
+              description: SVG representation of the tenant and it's timelines.
         "401":
           description: Unauthorized Error
           content:
@@ -929,6 +900,9 @@ components:
           format: hex
         size:
           type: integer
+          nullable: true
+          description: |
+            Size metric in bytes or null if inputs_only=true was given.
         segment_sizes:
           type: array
           items:

From d7f34bc3399c31f8e4c773cb5ae6f919e5d02d64 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 10 May 2024 17:41:34 +0300
Subject: [PATCH 023/130] draw_timeline_dir: draw branch points and gc cutoff
 lines (#7657)

in addition to layer names, expand the input vocabulary to recognize
lines in the form of:

    ${kind}:${lsn}

where:
- kind in `gc_cutoff` or `branch`
- lsn is accepted in Lsn display format (x/y) or hex (as used in layer
names)

gc_cutoff and branch have different colors.
---
 pageserver/ctl/src/draw_timeline_dir.rs | 89 +++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 7 deletions(-)

diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs
index 9a556cb3d4..d8082f8ab4 100644
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -28,6 +28,8 @@
 //! # From an `index_part.json` in S3
 //! (jq -r '.layer_metadata | keys[]' | cargo  run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
 //!
+//! # enrich with lines for gc_cutoff and a child branch point
+//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
 //! ```
 //!
 //! ## Viewing
@@ -48,7 +50,7 @@
 //! ```
 //!
 
-use anyhow::Result;
+use anyhow::{Context, Result};
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
@@ -90,6 +92,33 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
     (keys, lsns)
 }
 
+#[derive(Clone, Copy)]
+enum LineKind {
+    GcCutoff,
+    Branch,
+}
+
+impl From<LineKind> for Fill {
+    fn from(value: LineKind) -> Self {
+        match value {
+            LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
+            LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
+        }
+    }
+}
+
+impl FromStr for LineKind {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
+        Ok(match s {
+            "gc_cutoff" => LineKind::GcCutoff,
+            "branch" => LineKind::Branch,
+            _ => anyhow::bail!("unsupported linekind: {s}"),
+        })
+    }
+}
+
 pub fn main() -> Result<()> {
     // Parse layer filenames from stdin
     struct Layer {
@@ -99,8 +128,29 @@ pub fn main() -> Result<()> {
     }
     let mut files: Vec<Layer> = vec![];
     let stdin = io::stdin();
-    for line in stdin.lock().lines() {
+
+    let mut lines: Vec<(Lsn, LineKind)> = vec![];
+
+    for (lineno, line) in stdin.lock().lines().enumerate() {
+        let lineno = lineno + 1;
+
         let line = line.unwrap();
+        if let Some((kind, lsn)) = line.split_once(':') {
+            let (kind, lsn) = LineKind::from_str(kind)
+                .context("parse kind")
+                .and_then(|kind| {
+                    if lsn.contains('/') {
+                        Lsn::from_str(lsn)
+                    } else {
+                        Lsn::from_hex(lsn)
+                    }
+                    .map(|lsn| (kind, lsn))
+                    .context("parse lsn")
+                })
+                .with_context(|| format!("parse {line:?} on {lineno}"))?;
+            lines.push((lsn, kind));
+            continue;
+        }
         let line = PathBuf::from_str(&line).unwrap();
         let filename = line.file_name().unwrap();
         let filename = filename.to_str().unwrap();
@@ -117,8 +167,9 @@ pub fn main() -> Result<()> {
     }
 
     // Collect all coordinates
-    let mut keys: Vec<Key> = vec![];
-    let mut lsns: Vec<Lsn> = vec![];
+    let mut keys: Vec<Key> = Vec::with_capacity(files.len());
+    let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
+
     for Layer {
         key_range: keyr,
         lsn_range: lsnr,
@@ -131,6 +182,8 @@ pub fn main() -> Result<()> {
         lsns.push(lsnr.end);
     }
 
+    lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
+
     // Analyze
     let key_map = build_coordinate_compression_map(keys);
     let lsn_map = build_coordinate_compression_map(lsns);
@@ -144,10 +197,13 @@ pub fn main() -> Result<()> {
     println!(
         "{}",
         BeginSvg {
-            w: key_map.len() as f32,
+            w: (key_map.len() + 10) as f32,
             h: stretch * lsn_map.len() as f32
         }
     );
+
+    let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
+
     for Layer {
         filename,
         key_range: keyr,
@@ -169,7 +225,6 @@ pub fn main() -> Result<()> {
         let mut lsn_diff = (lsn_end - lsn_start) as f32;
         let mut fill = Fill::None;
         let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
-        let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
         let mut lsn_offset = 0.0;
 
         // Fill in and thicken rectangle if it's an
@@ -189,7 +244,7 @@ pub fn main() -> Result<()> {
         println!(
             "    {}",
             rectangle(
-                key_start as f32 + stretch * xmargin,
+                5.0 + key_start as f32 + stretch * xmargin,
                 stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
                 key_diff as f32 - stretch * 2.0 * xmargin,
                 stretch * (lsn_diff - 2.0 * ymargin)
@@ -200,6 +255,26 @@ pub fn main() -> Result<()> {
             .comment(filename)
         );
     }
+
+    for (lsn, kind) in lines {
+        let lsn_start = *lsn_map.get(&lsn).unwrap();
+        let lsn_end = lsn_start;
+        let stretch = 2.0;
+        let lsn_diff = 0.3;
+        let lsn_offset = -lsn_diff / 2.0;
+        let ymargin = 0.05;
+        println!(
+            "{}",
+            rectangle(
+                0.0f32 + stretch * xmargin,
+                stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
+                (key_map.len() + 10) as f32,
+                stretch * (lsn_diff - 2.0 * ymargin)
+            )
+            .fill(kind)
+        );
+    }
+
     println!("{}", EndSvg);
 
     eprintln!("num_images: {}", num_images);

From 6206f76419416c6c936c97df5e660d28333ee835 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 10 May 2024 17:46:50 +0300
Subject: [PATCH 024/130] build: run doctests (#7697)

While switching to use nextest with the repository in f28bdb6, we had
not noticed that it doesn't yet support running doctests. Run the doc
tests before other tests.
---
 .github/workflows/build_and_test.yml | 3 +++
 libs/utils/src/poison.rs             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index eada65505f..21e7a56670 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -341,6 +341,9 @@ jobs:
         env:
           NEXTEST_RETRIES: 3
         run: |
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
           for io_engine in std-fs tokio-epoll-uring ; do
             NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
           done
diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs
index 0bf5664f47..27378c69fc 100644
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -3,7 +3,7 @@
 //!  # Example
 //!
 //!  ```
-//!  # tokio_test::block_on(async {
+//!  # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
 //!  use utils::poison::Poison;
 //!  use std::time::Duration;
 //!

From d7c68dc981db2d73cb5ff617472266b29bbc2ace Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 10 May 2024 17:50:47 +0200
Subject: [PATCH 025/130] Tiered compaction: fix early exit check in main loop
 (#7702)

The old test based on the immutable `target_file_size` that was a
parameter to the function.

It makes no sense to go further once `current_level_target_height` has
reached `u64::MAX`, as lsn's are u64 typed. In practice, we should only
run into this if there is a bug, as the practical lsn range usually ends
much earlier.

Testing on `target_file_size` makes less sense, it basically implements
an invocation mode that turns off the looping and only runs one
iteration of it.
@hlinnaka agrees that `current_level_target_height` is better here.

Part of #7554
---
 pageserver/compaction/src/compact_tiered.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index 12882c9d59..20e9cf2196 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -106,7 +106,13 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
             ctx,
         )
         .await?;
-        if target_file_size == u64::MAX {
+        if current_level_target_height == u64::MAX {
+            // our target height includes all possible lsns
+            info!(
+                level = current_level_no,
+                depth = depth,
+                "compaction loop reached max current_level_target_height"
+            );
             break;
         }
         current_level_no += 1;

From 95098c3216929506b66ce244a2420d07ab65e8dc Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 10 May 2024 17:20:14 +0100
Subject: [PATCH 026/130] Fix checkpoint metric (#7701)

Split checkpoint_stats into two separate metrics: checkpoints_req and
checkpoints_timed

Fixes commit
https://github.com/neondatabase/neon/commit/21e1a496a3f706097578de396a9107813c541001

---------

Co-authored-by: Peter Bendel <peterbendel@neon.tech>
---
 vm-image-spec.yaml | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 56538630ac..e9d983eba3 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -278,15 +278,21 @@ files:
               ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
             END AS replication_delay_seconds;
 
-      - metric_name: checkpoint_stats
+      - metric_name: checkpoints_req
         type: gauge
-        help: 'Number of requested and scheduled checkpoints'
+        help: 'Number of requested checkpoints'
         key_labels:
-        values:
-          - checkpoints_req
-          - checkpoints_timed
+        values: [checkpoints_req]
         query: |
-          SELECT checkpoints_req, checkpoints_timed FROM pg_stat_bgwriter;
+          SELECT checkpoints_req FROM pg_stat_bgwriter;
+
+      - metric_name: checkpoints_timed
+        type: gauge
+        help: 'Number of scheduled checkpoints'
+        key_labels:
+        values: [checkpoints_timed]
+        query: |
+          SELECT checkpoints_timed FROM pg_stat_bgwriter;
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling

From 6351313ae96ab6d0e3e2b27ed2d86eed3dd004c9 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 10 May 2024 22:30:05 +0300
Subject: [PATCH 027/130] feat: allow detaching from ancestor for timelines
 without writes (#7639)

The first implementation #7456 did not include `index_part.json` changes
in an attempt to keep amount of changes down. Tracks the historic
reparentings and earlier detach in `index_part.json`.

- `index_part.json` receives a new field `lineage: Lineage`
- `Lineage` is queried through RemoteTimelineClient during basebackup,
creating `PREV LSN: none` for the invalid prev record lsn just as it
would had been created for a newly created timeline
- as `struct IndexPart` grew, it is now boxed in places

Cc: #6994
---
 pageserver/src/basebackup.rs                  |   2 +-
 pageserver/src/tenant/metadata.rs             |   6 +-
 .../src/tenant/remote_timeline_client.rs      |  28 +++-
 .../tenant/remote_timeline_client/index.rs    | 140 +++++++++++++++++-
 pageserver/src/tenant/timeline.rs             |  13 +-
 .../src/tenant/timeline/detach_ancestor.rs    |  10 --
 pageserver/src/tenant/upload_queue.rs         |   8 +-
 s3_scrubber/src/checks.rs                     |   4 +-
 s3_scrubber/src/tenant_snapshot.rs            |   2 +-
 .../regress/test_timeline_detach_ancestor.py  |  83 +++++------
 10 files changed, 225 insertions(+), 71 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 58b18dae7d..dca1510810 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -601,7 +601,7 @@ where
         // add zenith.signal file
         let mut zenith_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
-            if self.lsn == self.timeline.get_ancestor_lsn() {
+            if self.timeline.is_ancestor_lsn(self.lsn) {
                 write!(zenith_signal, "PREV LSN: none")
                     .map_err(|e| BasebackupError::Server(e.into()))?;
             } else {
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 8ba0775120..fc71ea7642 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -214,12 +214,12 @@ impl TimelineMetadata {
         self.body.ancestor_timeline = Some(*timeline);
     }
 
-    pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
+    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
         if let Some(ancestor) = self.body.ancestor_timeline {
-            assert_eq!(ancestor, *timeline);
+            assert_eq!(ancestor, branchpoint.0);
         }
         if self.body.ancestor_lsn != Lsn(0) {
-            assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
+            assert_eq!(self.body.ancestor_lsn, branchpoint.1);
         }
         self.body.ancestor_timeline = None;
         self.body.ancestor_lsn = Lsn(0);
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index bbe4e16378..9103760388 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -437,6 +437,19 @@ impl RemoteTimelineClient {
         }
     }
 
+    /// Returns true if this timeline was previously detached at this Lsn and the remote timeline
+    /// client is currently initialized.
+    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // technically this is a dirty read, but given how timeline detach ancestor is implemented
+        // via tenant restart, the lineage has always been uploaded.
+        self.upload_queue
+            .lock()
+            .unwrap()
+            .initialized_mut()
+            .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
+            .unwrap_or(false)
+    }
+
     fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
         let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
             current_remote_index_part
@@ -628,7 +641,7 @@ impl RemoteTimelineClient {
         );
 
         let index_part = IndexPart::from(&*upload_queue);
-        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
+        let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
         self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
         upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -647,7 +660,14 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
+            let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
+                return Err(anyhow::anyhow!(
+                    "cannot reparent without a current ancestor"
+                ));
+            };
+
             upload_queue.latest_metadata.reparent(new_parent);
+            upload_queue.latest_lineage.record_previous_ancestor(&prev);
 
             self.schedule_index_upload(upload_queue);
 
@@ -670,9 +690,8 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
-            upload_queue
-                .latest_metadata
-                .detach_from_ancestor(&adopted.0, &adopted.1);
+            upload_queue.latest_metadata.detach_from_ancestor(&adopted);
+            upload_queue.latest_lineage.record_detaching(&adopted);
 
             for layer in layers {
                 upload_queue
@@ -1811,6 +1830,7 @@ impl RemoteTimelineClient {
                         latest_files: initialized.latest_files.clone(),
                         latest_files_changes_since_metadata_upload_scheduled: 0,
                         latest_metadata: initialized.latest_metadata.clone(),
+                        latest_lineage: initialized.latest_lineage.clone(),
                         projected_remote_consistent_lsn: None,
                         visible_remote_consistent_lsn: initialized
                             .visible_remote_consistent_lsn
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 3e05905afa..b114d6aa10 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -6,6 +6,7 @@ use std::collections::HashMap;
 
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
+use utils::id::TimelineId;
 
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
@@ -84,6 +85,9 @@ pub struct IndexPart {
 
     #[serde(rename = "metadata_bytes")]
     pub metadata: TimelineMetadata,
+
+    #[serde(default)]
+    pub(crate) lineage: Lineage,
 }
 
 impl IndexPart {
@@ -96,10 +100,11 @@ impl IndexPart {
     /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
     ///      is always generated from the keys of `layer_metadata`)
     /// - 4: timeline_layers is fully removed.
-    const LATEST_VERSION: usize = 4;
+    /// - 5: lineage was added
+    const LATEST_VERSION: usize = 5;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
@@ -107,6 +112,7 @@ impl IndexPart {
         layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
         disk_consistent_lsn: Lsn,
         metadata: TimelineMetadata,
+        lineage: Lineage,
     ) -> Self {
         let layer_metadata = layers_and_metadata
             .iter()
@@ -119,6 +125,7 @@ impl IndexPart {
             disk_consistent_lsn,
             metadata,
             deleted_at: None,
+            lineage,
         }
     }
 
@@ -147,6 +154,7 @@ impl IndexPart {
             &HashMap::new(),
             example_metadata.disk_consistent_lsn(),
             example_metadata,
+            Default::default(),
         )
     }
 }
@@ -155,8 +163,9 @@ impl From<&UploadQueueInitialized> for IndexPart {
     fn from(uq: &UploadQueueInitialized) -> Self {
         let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
         let metadata = uq.latest_metadata.clone();
+        let lineage = uq.latest_lineage.clone();
 
-        Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
+        Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
     }
 }
 
@@ -184,8 +193,76 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata {
     }
 }
 
+/// Limited history of earlier ancestors.
+///
+/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
+/// reparented by having an later timeline be detached from it's ancestor.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
+pub(crate) struct Lineage {
+    /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
+    #[serde(skip_serializing_if = "is_false", default)]
+    reparenting_history_truncated: bool,
+
+    /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`]
+    ///
+    /// These are stored in case we want to support WAL based DR on the timeline. There can be many
+    /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
+    /// after [`Self::original_ancestor`] has been set.
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    reparenting_history: Vec<TimelineId>,
+
+    /// The ancestor from which this timeline has been detached from and when.
+    ///
+    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
+    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
+}
+
+fn is_false(b: &bool) -> bool {
+    !b
+}
+
+impl Lineage {
+    const REMEMBER_AT_MOST: usize = 100;
+
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
+        if self.reparenting_history.last() == Some(old_ancestor) {
+            // do not re-record it
+            return;
+        }
+
+        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
+
+        self.reparenting_history_truncated |= drop_oldest;
+        if drop_oldest {
+            self.reparenting_history.remove(0);
+        }
+        self.reparenting_history.push(*old_ancestor);
+    }
+
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
+        assert!(self.original_ancestor.is_none());
+
+        self.original_ancestor =
+            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+    }
+
+    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
+    /// to start a read/write primary at this lsn".
+    ///
+    /// Returns true if the Lsn was previously a branch point.
+    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        self.original_ancestor
+            .as_ref()
+            .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
+    }
+}
+
 #[cfg(test)]
 mod tests {
+    use std::str::FromStr;
+
     use super::*;
 
     #[test]
@@ -221,6 +298,7 @@ mod tests {
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: None,
+            lineage: Lineage::default(),
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -261,6 +339,7 @@ mod tests {
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: None,
+            lineage: Lineage::default(),
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -302,7 +381,8 @@ mod tests {
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            lineage: Lineage::default(),
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -347,6 +427,7 @@ mod tests {
             ])
             .unwrap(),
             deleted_at: None,
+            lineage: Lineage::default(),
         };
 
         let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -385,11 +466,58 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            lineage: Lineage::default(),
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
+
+    #[test]
+    fn v5_indexpart_is_parsed() {
+        let example = r#"{
+            "version":5,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1},
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}},
+                "disk_consistent_lsn":"0/15A7618",
+                "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+                "lineage":{
+                    "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
+                    "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
+                }
+        }"#;
+
+        let expected = IndexPart {
+            version: 5,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 23289856,
+                    generation: Generation::new(1),
+                    shard: ShardIndex::unsharded(),
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 1015808,
+                    generation: Generation::new(1),
+                    shard: ShardIndex::unsharded(),
+                })
+            ]),
+            disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
+            metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            deleted_at: None,
+            lineage: Lineage {
+                reparenting_history_truncated: false,
+                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
+                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
+            },
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
+        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
+    }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 60b3873b71..505dc8c30d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3037,6 +3037,18 @@ impl Timeline {
 
         Some(HeatMapTimeline::new(self.timeline_id, layers))
     }
+
+    /// Returns true if the given lsn is or was an ancestor branchpoint.
+    pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original
+        // branchpoint in the value in IndexPart::lineage
+        self.ancestor_lsn == lsn
+            || (self.ancestor_lsn == Lsn::INVALID
+                && self
+                    .remote_client
+                    .as_ref()
+                    .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn)))
+    }
 }
 
 type TraversalId = Arc<str>;
@@ -4354,7 +4366,6 @@ impl Timeline {
     /// - has an ancestor to detach from
     /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
     /// a technical requirement
-    /// - has prev_lsn in remote storage (temporary restriction)
     ///
     /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
     /// polled again until completion.
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 2641bf3d13..69b82344a6 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -22,8 +22,6 @@ pub(crate) enum Error {
     TooManyAncestors,
     #[error("shutting down, please retry later")]
     ShuttingDown,
-    #[error("detached timeline must receive writes before the operation")]
-    DetachedTimelineNeedsWrites,
     #[error("flushing failed")]
     FlushAncestor(#[source] anyhow::Error),
     #[error("layer download failed")]
@@ -94,14 +92,6 @@ pub(super) async fn prepare(
         return Err(TooManyAncestors);
     }
 
-    if detached.get_prev_record_lsn() == Lsn::INVALID
-        || detached.disk_consistent_lsn.load() == ancestor_lsn
-    {
-        // this is to avoid a problem that after detaching we would be unable to start up the
-        // compute because of "PREV_LSN: invalid".
-        return Err(DetachedTimelineNeedsWrites);
-    }
-
     // before we acquire the gate, we must mark the ancestor as having a detach operation
     // ongoing which will block other concurrent detach operations so we don't get to ackward
     // situations where there would be two branches trying to reparent earlier branches.
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 7797117e0f..a2f761fa94 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::remote_timeline_client::index::Lineage;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
@@ -56,6 +57,9 @@ pub(crate) struct UploadQueueInitialized {
     /// DANGER: do not return to outside world, e.g., safekeepers.
     pub(crate) latest_metadata: TimelineMetadata,
 
+    /// Part of the flattened "next" `index_part.json`.
+    pub(crate) latest_lineage: Lineage,
+
     /// `disk_consistent_lsn` from the last metadata file that was successfully
     /// uploaded. `Lsn(0)` if nothing was uploaded yet.
     /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -171,6 +175,7 @@ impl UploadQueue {
             latest_files: HashMap::new(),
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: metadata.clone(),
+            latest_lineage: Lineage::default(),
             projected_remote_consistent_lsn: None,
             visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
             // what follows are boring default initializations
@@ -218,6 +223,7 @@ impl UploadQueue {
             latest_files: files,
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: index_part.metadata.clone(),
+            latest_lineage: index_part.lineage.clone(),
             projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
             visible_remote_consistent_lsn: Arc::new(
                 index_part.metadata.disk_consistent_lsn().into(),
@@ -290,7 +296,7 @@ pub(crate) enum UploadOp {
     UploadLayer(ResidentLayer, LayerFileMetadata),
 
     /// Upload the metadata file
-    UploadMetadata(IndexPart, Lsn),
+    UploadMetadata(Box<IndexPart>, Lsn),
 
     /// Delete layer files
     Delete(Delete),
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 68133fc0a9..dd64a0a98f 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -246,7 +246,7 @@ pub(crate) struct S3TimelineBlobData {
 #[derive(Debug)]
 pub(crate) enum BlobDataParseResult {
     Parsed {
-        index_part: IndexPart,
+        index_part: Box<IndexPart>,
         index_part_generation: Generation,
         s3_layers: HashSet<(LayerName, Generation)>,
     },
@@ -368,7 +368,7 @@ pub(crate) async fn list_timeline_blobs(
             Ok(index_part) => {
                 return Ok(S3TimelineBlobData {
                     blob_data: BlobDataParseResult::Parsed {
-                        index_part,
+                        index_part: Box::new(index_part),
                         index_part_generation,
                         s3_layers,
                     },
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs
index 2c93a8490a..a24a1e92ae 100644
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -159,7 +159,7 @@ impl SnapshotDownloader {
     async fn download_timeline(
         &self,
         ttid: TenantShardTimelineId,
-        index_part: IndexPart,
+        index_part: Box<IndexPart>,
         index_part_generation: Generation,
         ancestor_layers: &mut HashMap<
             TenantShardTimelineId,
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index b8a88ca6df..075f0a6bbc 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1,3 +1,4 @@
+import datetime
 import enum
 from concurrent.futures import ThreadPoolExecutor
 from queue import Empty, Queue
@@ -12,6 +13,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import HistoricLayerInfo
 from fixtures.pageserver.utils import wait_timeline_detail_404
+from fixtures.remote_storage import LocalFsStorage
 from fixtures.types import Lsn, TimelineId
 
 
@@ -56,15 +58,16 @@ SHUTDOWN_ALLOWED_ERRORS = [
 
 @pytest.mark.parametrize("branchpoint", Branchpoint.all())
 @pytest.mark.parametrize("restart_after", [True, False])
+@pytest.mark.parametrize("write_to_branch_first", [True, False])
 def test_ancestor_detach_branched_from(
-    neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool
+    neon_env_builder: NeonEnvBuilder,
+    branchpoint: Branchpoint,
+    restart_after: bool,
+    write_to_branch_first: bool,
 ):
     """
     Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached.
     """
-    # TODO: parametrize; currently unimplemented over at pageserver
-    write_to_branch_first = True
-
     env = neon_env_builder.init_start()
 
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
@@ -174,8 +177,7 @@ def test_ancestor_detach_branched_from(
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
 
 
-@pytest.mark.parametrize("restart_after", [True, False])
-def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool):
+def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
     """
     The case from RFC:
 
@@ -204,9 +206,6 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res
     We confirm the end result by being able to delete "old main" after deleting "after".
     """
 
-    # TODO: support not yet implemented for these
-    write_to_branch_first = True
-
     env = neon_env_builder.init_start()
 
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
@@ -244,42 +243,57 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res
 
     after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None)
 
-    if write_to_branch_first:
-        with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
-            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192
-            with ep.cursor() as cur:
-                cur.execute("UPDATE audit SET starts = starts + 1")
-                assert cur.rowcount == 1
-            wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)
-
-        client.timeline_checkpoint(env.initial_tenant, timeline_id)
-
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
     assert all_reparented == {reparented, same_branchpoint}
 
-    if restart_after:
-        env.pageserver.stop()
-        env.pageserver.start()
-
     env.pageserver.quiesce_tenants()
 
     # checking the ancestor after is much faster than waiting for the endpoint not start
     expected_result = [
         ("main", env.initial_timeline, None, 16384, 1),
         ("after", after, env.initial_timeline, 16384, 1),
-        ("new main", timeline_id, None, 8192, 2),
+        ("new main", timeline_id, None, 8192, 1),
         ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1),
         ("reparented", reparented, timeline_id, 0, 1),
     ]
 
-    for _, timeline_id, expected_ancestor, _, _ in expected_result:
-        details = client.timeline_detail(env.initial_tenant, timeline_id)
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    for _, queried_timeline, expected_ancestor, _, _ in expected_result:
+        details = client.timeline_detail(env.initial_tenant, queried_timeline)
         ancestor_timeline_id = details["ancestor_timeline_id"]
         if expected_ancestor is None:
             assert ancestor_timeline_id is None
         else:
             assert TimelineId(ancestor_timeline_id) == expected_ancestor
 
+        index_part = env.pageserver_remote_storage.index_content(
+            env.initial_tenant, queried_timeline
+        )
+        lineage = index_part["lineage"]
+        assert lineage is not None
+
+        assert lineage.get("reparenting_history_overflown", "false") == "false"
+
+        if queried_timeline == timeline_id:
+            original_ancestor = lineage["original_ancestor"]
+            assert original_ancestor is not None
+            assert original_ancestor[0] == str(env.initial_timeline)
+            assert original_ancestor[1] == str(branchpoint_x)
+
+            # this does not contain Z in the end, so fromisoformat accepts it
+            # it is to be in line with the deletion timestamp.. well, almost.
+            when = original_ancestor[2][:26]
+            when_ts = datetime.datetime.fromisoformat(when)
+            assert when_ts < datetime.datetime.now()
+            assert len(lineage.get("reparenting_history", [])) == 0
+        elif expected_ancestor == timeline_id:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert lineage["reparenting_history"] == [str(env.initial_timeline)]
+        else:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert len(lineage.get("reparenting_history", [])) == 0
+
     for name, _, _, rows, starts in expected_result:
         with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
             assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
@@ -293,14 +307,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
 
 
-@pytest.mark.parametrize("restart_after", [True, False])
-def test_detached_receives_flushes_while_being_detached(
-    neon_env_builder: NeonEnvBuilder, restart_after: bool
-):
+def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder):
     """
     Makes sure that the timeline is able to receive writes through-out the detach process.
     """
-    write_to_branch_first = True
 
     env = neon_env_builder.init_start()
 
@@ -330,12 +340,6 @@ def test_detached_receives_flushes_while_being_detached(
     ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant)
     assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
 
-    if write_to_branch_first:
-        rows += insert_rows(256, ep)
-        wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)
-        client.timeline_checkpoint(env.initial_tenant, timeline_id)
-        log.info("completed {write_to_branch_first=}")
-
     def small_txs(ep, queue: Queue[str], barrier):
         extra_rows = 0
 
@@ -368,11 +372,6 @@ def test_detached_receives_flushes_while_being_detached(
         reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
         assert len(reparented) == 0
 
-        if restart_after:
-            # ep and row production is kept alive on purpose
-            env.pageserver.stop()
-            env.pageserver.start()
-
         env.pageserver.quiesce_tenants()
 
         queue.put("done")

From 4270e86eb282367fedebe87a5a363619b466eece Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 13 May 2024 10:58:03 +0300
Subject: [PATCH 028/130] test(ancestor detach): verify with fullbackup (#7706)

In timeline detach ancestor tests there is no way to really be sure that
there were no subtle off-by one bugs. One such bug is demoed and
reverted. Add verifying fullbackup is equal before and after detaching
ancestor.

Fullbackup is expected to be equal apart from `zenith.signal`, which is
known to be good because endpoint can be started without the detached
branch receiving writes.
---
 .../regress/test_timeline_detach_ancestor.py  | 89 ++++++++++++++++++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 075f0a6bbc..214e10c32e 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1,14 +1,19 @@
 import datetime
 import enum
+import tarfile
+import time
 from concurrent.futures import ThreadPoolExecutor
+from hashlib import sha256
+from pathlib import Path
 from queue import Empty, Queue
 from threading import Barrier
-from typing import List
+from typing import IO, List, Set, Tuple, Union
 
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    PgBin,
     wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import HistoricLayerInfo
@@ -60,7 +65,10 @@ SHUTDOWN_ALLOWED_ERRORS = [
 @pytest.mark.parametrize("restart_after", [True, False])
 @pytest.mark.parametrize("write_to_branch_first", [True, False])
 def test_ancestor_detach_branched_from(
+    test_output_dir,
+    pg_distrib_dir,
     neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
     branchpoint: Branchpoint,
     restart_after: bool,
     write_to_branch_first: bool,
@@ -70,6 +78,7 @@ def test_ancestor_detach_branched_from(
     """
     env = neon_env_builder.init_start()
 
+    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
 
     client = env.pageserver.http_client()
@@ -146,6 +155,20 @@ def test_ancestor_detach_branched_from(
     else:
         branch_layers = set()
 
+    # run fullbackup to make sure there are no off by one errors
+    # take this on the parent
+    fullbackup_before = test_output_dir / "fullbackup-before.tar"
+    cmd = [
+        "psql",
+        "--no-psqlrc",
+        env.pageserver.connstr(),
+        "-c",
+        f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}",
+        "-o",
+        str(fullbackup_before),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
+
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
     assert all_reparented == set()
 
@@ -173,9 +196,73 @@ def test_ancestor_detach_branched_from(
     # but if nothing was copied, then there is no nice rule.
     # there could be a hole in LSNs between copied from the "old main" and the first branch layer.
 
+    # take this on the detached, at same lsn
+    fullbackup_after = test_output_dir / "fullbackup-after.tar"
+    cmd = [
+        "psql",
+        "--no-psqlrc",
+        env.pageserver.connstr(),
+        "-c",
+        f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}",
+        "-o",
+        str(fullbackup_after),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
+
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
 
+    # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
+    # as there is always "PREV_LSN: invalid" for "before"
+    skip_files = {"zenith.signal"}
+
+    tar_cmp(fullbackup_before, fullbackup_after, skip_files)
+
+
+def tar_cmp(left: Path, right: Path, skip_files: Set[str]):
+    """
+    This is essentially:
+
+    lines=$(comm -3 \
+        <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        | wc -l)
+    [ "$lines" = "0" ]
+
+    But in a more mac friendly fashion.
+    """
+    started_at = time.time()
+
+    def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
+        assert reader is not None
+        digest = sha256(usedforsecurity=False)
+        while True:
+            buf = reader.read(64 * 1024)
+            if not buf:
+                break
+            digest.update(buf)
+        return digest.digest()
+
+    def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
+        with tarfile.open(p) as f:
+            matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
+            ret = list(
+                map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
+            )
+            ret.sort(key=lambda t: t[0])
+            return ret
+
+    left_list, right_list = map(build_hash_list, [left, right])
+
+    try:
+        assert len(left_list) == len(right_list)
+
+        for left_tuple, right_tuple in zip(left_list, right_list):
+            assert left_tuple == right_tuple
+    finally:
+        elapsed = time.time() - started_at
+        log.info(f"tar_cmp completed in {elapsed}s")
+
 
 def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
     """

From 216fc5ba7beb48ac7ae45aa007f72f1a3c0f62e0 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 13 May 2024 11:56:07 +0300
Subject: [PATCH 029/130] test: fix confusing limit and logging (#7589)

The test has been flaky since 2024-04-11 for unknown reason, and the
logging was off. Fix the logging and raise the limit a bit. The
problematic ratio reproduces with pg14 and added sleep (not included)
but not on pg15. The new ratio abs diff limit works for all inspected
examples.

Cc: #7536
---
 test_runner/regress/test_disk_usage_eviction.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 5e9efa7cce..1d73e9cb18 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -623,15 +623,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
             ratio = count_now / original_count
             abs_diff = abs(ratio - expected_ratio)
             assert original_count > count_now
-            log.info(
-                f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1"
-            )
 
+            expectation = 0.06
+            log.info(
+                f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
+            )
             # in this test case both relative_spare and relative_equal produce
             # the same outcomes; this must be a quantization effect of similar
             # sizes (-s4 and -s6) and small (5MB) layer size.
             # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
-            assert abs_diff < 0.05
+            assert abs_diff < expectation
 
 
 @pytest.mark.parametrize(

From 1a1d52787579a617028c66f8a9f41b46641a1035 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 13 May 2024 12:21:49 +0300
Subject: [PATCH 030/130] test: allow vectored get validation failure during
 shutdown (#7716)

Per [evidence] the timeline ancestor detach tests can panic while
shutting down on vectored get validation. Allow the error because tenant
is restarted twice in the test.

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7708/9058185709/index.html#suites/a1c2be32556270764423c495fad75d47/d444f7e5c0a18ce9
---
 test_runner/regress/test_timeline_detach_ancestor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 214e10c32e..1e961a4b2f 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -58,6 +58,8 @@ class Branchpoint(str, enum.Enum):
 SHUTDOWN_ALLOWED_ERRORS = [
     ".*initial size calculation failed: downloading failed, possibly for shutdown",
     ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+    ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*",
+    ".*Task 'initial size calculation' .* panicked.*",
 ]
 
 

From b58a615197374da349525da843b04849c47d610f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 13 May 2024 11:22:10 +0200
Subject: [PATCH 031/130] chore(pageserver): plumb through RequestContext to
 VirtualFile read methods (#7720)

This PR introduces no functional changes.

The `open()` path will be done separately.

refs https://github.com/neondatabase/neon/issues/6107
refs https://github.com/neondatabase/neon/issues/7386
---
 pageserver/src/tenant/block_io.rs             |  7 +-
 .../src/tenant/ephemeral_file/page_caching.rs |  2 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  9 +-
 .../src/tenant/storage_layer/image_layer.rs   |  5 +-
 pageserver/src/tenant/vectored_blob_io.rs     |  4 +-
 pageserver/src/virtual_file.rs                | 98 +++++++++++++------
 6 files changed, 82 insertions(+), 43 deletions(-)

diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 37c84be342..92928116c1 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> {
             #[cfg(test)]
             TestDisk(r) => r.read_blk(blknum),
             #[cfg(test)]
-            VirtualFile(r) => r.read_blk(blknum).await,
+            VirtualFile(r) => r.read_blk(blknum, ctx).await,
         }
     }
 }
@@ -177,10 +177,11 @@ impl<'a> FileBlockReader<'a> {
         &self,
         buf: PageWriteGuard<'static>,
         blkno: u32,
+        ctx: &RequestContext,
     ) -> Result<PageWriteGuard<'static>, std::io::Error> {
         assert!(buf.len() == PAGE_SZ);
         self.file
-            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
+            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx)
             .await
     }
     /// Read a block.
@@ -206,7 +207,7 @@ impl<'a> FileBlockReader<'a> {
             ReadBufResult::Found(guard) => Ok(guard.into()),
             ReadBufResult::NotFound(write_guard) => {
                 // Read the page from disk into the buffer
-                let write_guard = self.fill_buffer(write_guard, blknum).await?;
+                let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?;
                 Ok(write_guard.mark_valid().into())
             }
         }
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
index 42def8858e..276ac87064 100644
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -78,7 +78,7 @@ impl RW {
                     page_cache::ReadBufResult::NotFound(write_guard) => {
                         let write_guard = writer
                             .file
-                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
+                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                             .await?;
                         let read_guard = write_guard.mark_valid();
                         return Ok(BlockLease::PageReadGuard(read_guard));
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index c38c9bb656..4f30cf2e84 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -908,7 +908,7 @@ impl DeltaLayerInner {
         .await
         .map_err(GetVectoredError::Other)?;
 
-        self.do_reads_and_update_state(reads, reconstruct_state)
+        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
             .await;
 
         reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
@@ -1012,6 +1012,7 @@ impl DeltaLayerInner {
         &self,
         reads: Vec<VectoredRead>,
         reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
     ) {
         let vectored_blob_reader = VectoredBlobReader::new(&self.file);
         let mut ignore_key_with_err = None;
@@ -1029,7 +1030,7 @@ impl DeltaLayerInner {
         // track when a key is done.
         for read in reads.into_iter().rev() {
             let res = vectored_blob_reader
-                .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
                 .await;
 
             let blobs_buf = match res {
@@ -1274,7 +1275,7 @@ impl DeltaLayerInner {
 
                 buf.clear();
                 buf.reserve(read.size());
-                let res = reader.read_blobs(&read, buf).await?;
+                let res = reader.read_blobs(&read, buf, ctx).await?;
 
                 for blob in res.blobs {
                     let key = blob.meta.key;
@@ -1848,7 +1849,7 @@ mod test {
 
             for read in vectored_reads {
                 let blobs_buf = vectored_blob_reader
-                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                    .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
                     .await?;
                 for meta in blobs_buf.blobs.iter() {
                     let value = &blobs_buf.buf[meta.start..meta.end];
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index c9874873e4..72d1f36cab 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -474,7 +474,7 @@ impl ImageLayerInner {
             .await
             .map_err(GetVectoredError::Other)?;
 
-        self.do_reads_and_update_state(reads, reconstruct_state)
+        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
             .await;
 
         Ok(())
@@ -537,6 +537,7 @@ impl ImageLayerInner {
         &self,
         reads: Vec<VectoredRead>,
         reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
     ) {
         let max_vectored_read_bytes = self
             .max_vectored_read_bytes
@@ -565,7 +566,7 @@ impl ImageLayerInner {
             }
 
             let buf = BytesMut::with_capacity(buf_size);
-            let res = vectored_blob_reader.read_blobs(&read, buf).await;
+            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
 
             match res {
                 Ok(blobs_buf) => {
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 91934d5e0e..6e825760e3 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -23,6 +23,7 @@ use pageserver_api::key::Key;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;
 
+use crate::context::RequestContext;
 use crate::virtual_file::VirtualFile;
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -285,6 +286,7 @@ impl<'a> VectoredBlobReader<'a> {
         &self,
         read: &VectoredRead,
         buf: BytesMut,
+        ctx: &RequestContext,
     ) -> Result<VectoredBlobsBuf, std::io::Error> {
         assert!(read.size() > 0);
         assert!(
@@ -295,7 +297,7 @@ impl<'a> VectoredBlobReader<'a> {
         );
         let buf = self
             .file
-            .read_exact_at_n(buf, read.start, read.size())
+            .read_exact_at_n(buf, read.start, read.size(), ctx)
             .await?;
 
         let blobs_at = read.blobs_at.as_slice();
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index a17488a286..8dee73891b 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -576,21 +576,34 @@ impl VirtualFile {
         Ok(self.pos)
     }
 
-    pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
+    pub async fn read_exact_at<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<B, Error>
     where
         B: IoBufMut + Send,
     {
-        let (buf, res) =
-            read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
+        let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| {
+            self.read_at(buf, offset, ctx)
+        })
+        .await;
         res.map(|()| buf)
     }
 
-    pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
+    pub async fn read_exact_at_n<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        count: usize,
+        ctx: &RequestContext,
+    ) -> Result<B, Error>
     where
         B: IoBufMut + Send,
     {
         let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
-            self.read_at(buf, offset)
+            self.read_at(buf, offset, ctx)
         })
         .await;
         res.map(|()| buf)
@@ -601,12 +614,13 @@ impl VirtualFile {
         &self,
         page: PageWriteGuard<'static>,
         offset: u64,
+        ctx: &RequestContext,
     ) -> Result<PageWriteGuard<'static>, Error> {
         let buf = PageWriteGuardBuf {
             page,
             init_up_to: 0,
         };
-        let res = self.read_exact_at(buf, offset).await;
+        let res = self.read_exact_at(buf, offset, ctx).await;
         res.map(|PageWriteGuardBuf { page, .. }| page)
             .map_err(|e| Error::new(ErrorKind::Other, e))
     }
@@ -699,7 +713,12 @@ impl VirtualFile {
         (buf, Ok(n))
     }
 
-    pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
+    pub(crate) async fn read_at<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (B, Result<usize, Error>)
     where
         B: tokio_epoll_uring::BoundedBufMut + Send,
     {
@@ -1020,20 +1039,21 @@ impl VirtualFile {
     pub(crate) async fn read_blk(
         &self,
         blknum: u32,
+        ctx: &RequestContext,
     ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
         use crate::page_cache::PAGE_SZ;
         let buf = vec![0; PAGE_SZ];
         let buf = self
-            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
+            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx)
             .await?;
         Ok(crate::tenant::block_io::BlockLease::Vec(buf))
     }
 
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
         let mut tmp = vec![0; 128];
         loop {
             let res;
-            (tmp, res) = self.read_at(tmp, self.pos).await;
+            (tmp, res) = self.read_at(tmp, self.pos, ctx).await;
             match res {
                 Ok(0) => return Ok(()),
                 Ok(n) => {
@@ -1176,9 +1196,14 @@ mod tests {
     }
 
     impl MaybeVirtualFile {
-        async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
+        async fn read_exact_at(
+            &self,
+            mut buf: Vec<u8>,
+            offset: u64,
+            ctx: &RequestContext,
+        ) -> Result<Vec<u8>, Error> {
             match self {
-                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
+                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await,
                 MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
             }
         }
@@ -1230,13 +1255,13 @@ mod tests {
 
         // Helper function to slurp contents of a file, starting at the current position,
         // into a string
-        async fn read_string(&mut self) -> Result<String, Error> {
+        async fn read_string(&mut self, ctx: &RequestContext) -> Result<String, Error> {
             use std::io::Read;
             let mut buf = String::new();
             match self {
                 MaybeVirtualFile::VirtualFile(file) => {
                     let mut buf = Vec::new();
-                    file.read_to_end(&mut buf).await?;
+                    file.read_to_end(&mut buf, ctx).await?;
                     return Ok(String::from_utf8(buf).unwrap());
                 }
                 MaybeVirtualFile::File(file) => {
@@ -1247,9 +1272,14 @@ mod tests {
         }
 
         // Helper function to slurp a portion of a file into a string
-        async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
+        async fn read_string_at(
+            &mut self,
+            pos: u64,
+            len: usize,
+            ctx: &RequestContext,
+        ) -> Result<String, Error> {
             let buf = vec![0; len];
-            let buf = self.read_exact_at(buf, pos).await?;
+            let buf = self.read_exact_at(buf, pos, ctx).await?;
             Ok(String::from_utf8(buf).unwrap())
         }
     }
@@ -1303,7 +1333,7 @@ mod tests {
         file_a.write_all(b"foobar".to_vec(), &ctx).await?;
 
         // cannot read from a file opened in write-only mode
-        let _ = file_a.read_string().await.unwrap_err();
+        let _ = file_a.read_string(&ctx).await.unwrap_err();
 
         // Close the file and re-open for reading
         let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
@@ -1312,24 +1342,24 @@ mod tests {
         let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
 
         // Try simple read
-        assert_eq!("foobar", file_a.read_string().await?);
+        assert_eq!("foobar", file_a.read_string(&ctx).await?);
 
         // It's positioned at the EOF now.
-        assert_eq!("", file_a.read_string().await?);
+        assert_eq!("", file_a.read_string(&ctx).await?);
 
         // Test seeks.
         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
-        assert_eq!("ar", file_a.read_string().await?);
+        assert_eq!("ar", file_a.read_string(&ctx).await?);
 
         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
         assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
-        assert_eq!("bar", file_a.read_string().await?);
+        assert_eq!("bar", file_a.read_string(&ctx).await?);
 
         assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         // Test erroneous seeks to before byte 0
         file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
@@ -1337,7 +1367,7 @@ mod tests {
         file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
 
         // the erroneous seek should have left the position unchanged
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         // Create another test file, and try FileExt functions on it.
         let path_b = testdir.join("file_b");
@@ -1354,7 +1384,7 @@ mod tests {
         file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
         file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
 
-        assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
+        assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
 
         // Open a lot of files, enough to cause some evictions. (Or to be precise,
         // open the same file many times. The effect is the same.)
@@ -1366,7 +1396,7 @@ mod tests {
         for _ in 0..100 {
             let mut vfile =
                 openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
-            assert_eq!("FOOBAR", vfile.read_string().await?);
+            assert_eq!("FOOBAR", vfile.read_string(&ctx).await?);
             vfiles.push(vfile);
         }
 
@@ -1375,13 +1405,13 @@ mod tests {
 
         // The underlying file descriptor for 'file_a' should be closed now. Try to read
         // from it again. We left the file positioned at offset 1 above.
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         // Check that all the other FDs still work too. Use them in random order for
         // good measure.
         vfiles.as_mut_slice().shuffle(&mut thread_rng());
         for vfile in vfiles.iter_mut() {
-            assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?);
+            assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
         }
 
         Ok(())
@@ -1397,6 +1427,7 @@ mod tests {
         const THREADS: usize = 100;
         const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
 
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
         std::fs::create_dir_all(&testdir)?;
 
@@ -1425,12 +1456,13 @@ mod tests {
         let mut hdls = Vec::new();
         for _threadno in 0..THREADS {
             let files = files.clone();
+            let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
             let hdl = rt.spawn(async move {
                 let mut buf = vec![0u8; SIZE];
                 let mut rng = rand::rngs::OsRng;
                 for _ in 1..1000 {
                     let f = &files[rng.gen_range(0..files.len())];
-                    buf = f.read_exact_at(buf, 0).await.unwrap();
+                    buf = f.read_exact_at(buf, 0, &ctx).await.unwrap();
                     assert!(buf == SAMPLE);
                 }
             });
@@ -1446,6 +1478,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_atomic_overwrite_basic() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
         std::fs::create_dir_all(&testdir).unwrap();
 
@@ -1456,7 +1489,7 @@ mod tests {
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
-        let post = file.read_string().await.unwrap();
+        let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
         drop(file);
@@ -1465,7 +1498,7 @@ mod tests {
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
-        let post = file.read_string().await.unwrap();
+        let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "bar");
         assert!(!tmp_path.exists());
         drop(file);
@@ -1473,6 +1506,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_atomic_overwrite_preexisting_tmp() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir =
             crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
         std::fs::create_dir_all(&testdir).unwrap();
@@ -1488,7 +1522,7 @@ mod tests {
             .unwrap();
 
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
-        let post = file.read_string().await.unwrap();
+        let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
         drop(file);

From f50ff1456091d8e5ac8db2d01f4b058ce36d6569 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 13 May 2024 13:05:46 +0100
Subject: [PATCH 032/130] pageserver: refuse to run without remote storage
 (#7722)

## Problem

Since https://github.com/neondatabase/neon/pull/6769, the pageserver is
intentionally not usable without remote storage: it's purpose is to act
as a cache to an object store, rather than as a source of truth in its
own right.

## Summary of changes

- Make remote storage configuration mandatory: the pageserver will
refuse to start if it is not provided.

This is a precursor that will make it safe to subsequently remove all
the internal Option<>s
---
 pageserver/src/bin/pageserver.rs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index eb4b8bb8bb..49f8a41b37 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -383,7 +383,7 @@ fn start_pageserver(
     let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
 
     // Set up remote storage client
-    let remote_storage = create_remote_storage_client(conf)?;
+    let remote_storage = Some(create_remote_storage_client(conf)?);
 
     // Set up deletion queue
     let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -708,12 +708,11 @@ fn start_pageserver(
 
 fn create_remote_storage_client(
     conf: &'static PageServerConf,
-) -> anyhow::Result<Option<GenericRemoteStorage>> {
+) -> anyhow::Result<GenericRemoteStorage> {
     let config = if let Some(config) = &conf.remote_storage_config {
         config
     } else {
-        tracing::warn!("no remote storage configured, this is a deprecated configuration");
-        return Ok(None);
+        anyhow::bail!("no remote storage configured, this is a deprecated configuration");
     };
 
     // Create the client
@@ -733,7 +732,7 @@ fn create_remote_storage_client(
             GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
     }
 
-    Ok(Some(remote_storage))
+    Ok(remote_storage)
 }
 
 fn cli() -> Command {

From d9dcbffac37ccd3331ec9adcd12fd20ce0ea31aa Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 13 May 2024 15:16:23 +0300
Subject: [PATCH 033/130] python: allow using allowed_errors.py (#7719)

See #7718. Fix it by renaming all `types.py` to `common_types.py`.

Additionally, add an advert for using `allowed_errors.py` to test any
added regex.
---
 scripts/check_allowed_errors.sh                | 18 ++++++++++++++++++
 test_runner/fixtures/benchmark_fixture.py      |  2 +-
 .../fixtures/{types.py => common_types.py}     |  0
 test_runner/fixtures/compute_reconfigure.py    |  2 +-
 test_runner/fixtures/neon_fixtures.py          |  4 ++--
 .../fixtures/pageserver/allowed_errors.py      |  5 +++--
 .../pageserver/{types.py => common_types.py}   |  2 +-
 test_runner/fixtures/pageserver/http.py        |  2 +-
 .../fixtures/pageserver/many_tenants.py        |  2 +-
 .../fixtures/pageserver/remote_storage.py      |  4 ++--
 test_runner/fixtures/pageserver/utils.py       |  2 +-
 test_runner/fixtures/remote_storage.py         |  2 +-
 test_runner/fixtures/safekeeper/http.py        |  2 +-
 test_runner/fixtures/safekeeper/utils.py       |  2 +-
 test_runner/fixtures/utils.py                  |  7 ++++---
 test_runner/fixtures/workload.py               |  2 +-
 test_runner/performance/pageserver/util.py     |  2 +-
 .../performance/test_branch_creation.py        |  2 +-
 test_runner/performance/test_bulk_insert.py    |  2 +-
 .../test_storage_controller_scale.py           |  2 +-
 .../performance/test_wal_backpressure.py       |  2 +-
 test_runner/regress/test_ancestor_branch.py    |  2 +-
 .../regress/test_attach_tenant_config.py       |  2 +-
 test_runner/regress/test_auth.py               |  2 +-
 test_runner/regress/test_branch_and_gc.py      |  2 +-
 test_runner/regress/test_branch_behind.py      |  2 +-
 test_runner/regress/test_branching.py          |  2 +-
 test_runner/regress/test_broken_timeline.py    |  2 +-
 test_runner/regress/test_compatibility.py      |  2 +-
 .../regress/test_disk_usage_eviction.py        |  2 +-
 test_runner/regress/test_duplicate_layers.py   |  2 +-
 test_runner/regress/test_fullbackup.py         |  2 +-
 test_runner/regress/test_gc_aggressive.py      |  2 +-
 test_runner/regress/test_import.py             |  2 +-
 test_runner/regress/test_layer_eviction.py     |  2 +-
 test_runner/regress/test_layers_from_future.py |  4 ++--
 .../regress/test_logical_replication.py        |  2 +-
 test_runner/regress/test_lsn_mapping.py        |  2 +-
 test_runner/regress/test_neon_cli.py           |  2 +-
 test_runner/regress/test_next_xid.py           |  2 +-
 test_runner/regress/test_old_request_lsn.py    |  2 +-
 test_runner/regress/test_ondemand_download.py  |  2 +-
 .../regress/test_ondemand_slru_download.py     |  2 +-
 test_runner/regress/test_pageserver_api.py     |  2 +-
 .../regress/test_pageserver_generations.py     |  2 +-
 .../test_pageserver_getpage_throttle.py        |  2 +-
 .../regress/test_pageserver_layer_rolling.py   |  2 +-
 .../test_pageserver_metric_collection.py       |  2 +-
 .../regress/test_pageserver_secondary.py       |  4 ++--
 test_runner/regress/test_pitr_gc.py            |  2 +-
 test_runner/regress/test_read_trace.py         |  2 +-
 test_runner/regress/test_readonly_node.py      |  2 +-
 test_runner/regress/test_remote_storage.py     |  4 ++--
 test_runner/regress/test_s3_restore.py         |  2 +-
 test_runner/regress/test_s3_scrubber.py        |  2 +-
 test_runner/regress/test_sharding.py           |  2 +-
 test_runner/regress/test_storage_controller.py |  2 +-
 test_runner/regress/test_tenant_conf.py        |  2 +-
 test_runner/regress/test_tenant_delete.py      |  2 +-
 test_runner/regress/test_tenant_detach.py      |  2 +-
 test_runner/regress/test_tenant_relocation.py  |  2 +-
 test_runner/regress/test_tenant_size.py        |  2 +-
 test_runner/regress/test_tenant_tasks.py       |  2 +-
 test_runner/regress/test_tenants.py            |  2 +-
 .../test_tenants_with_remote_storage.py        |  4 ++--
 test_runner/regress/test_timeline_delete.py    |  2 +-
 .../regress/test_timeline_detach_ancestor.py   |  2 +-
 test_runner/regress/test_timeline_size.py      |  2 +-
 test_runner/regress/test_wal_acceptor.py       |  2 +-
 test_runner/regress/test_wal_acceptor_async.py |  2 +-
 test_runner/regress/test_wal_receiver.py       |  2 +-
 test_runner/regress/test_wal_restore.py        |  2 +-
 .../test_walredo_not_left_behind_on_detach.py  |  2 +-
 73 files changed, 100 insertions(+), 80 deletions(-)
 create mode 100755 scripts/check_allowed_errors.sh
 rename test_runner/fixtures/{types.py => common_types.py} (100%)
 rename test_runner/fixtures/pageserver/{types.py => common_types.py} (98%)

diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh
new file mode 100755
index 0000000000..87e52c1e64
--- /dev/null
+++ b/scripts/check_allowed_errors.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -eu
+
+HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")"
+SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py"
+
+# first run to understand all of the errors:
+#
+# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log
+# example: ./scripts/check_allowed_errors.sh -i pageserver.log
+#
+# then edit the test local allowed_errors to the
+# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure
+# they are handled.
+#
+# finally revert any local changes to allowed_errors.py.
+poetry run python3 "$HELPER_DIR/../$SCRIPT" $*
diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index c32748f6f0..038f557cc8 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 from _pytest.terminal import TerminalReporter
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonPageserver
-from fixtures.types import TenantId, TimelineId
 
 """
 This file contains fixtures for micro-benchmarks.
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py
similarity index 100%
rename from test_runner/fixtures/types.py
rename to test_runner/fixtures/common_types.py
diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
index a883d94f73..66fc35b6aa 100644
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -5,8 +5,8 @@ import pytest
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
+from fixtures.common_types import TenantId
 from fixtures.log_helper import log
-from fixtures.types import TenantId
 
 
 class ComputeReconfigure:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index da379693a0..0c2b70202e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -47,14 +47,15 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
     DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
+from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
@@ -72,7 +73,6 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     allure_add_grafana_links,
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 58a76d7586..91cd67d107 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -131,9 +131,10 @@ if __name__ == "__main__":
         "-i",
         "--input",
         type=argparse.FileType("r"),
-        default=sys.stdin,
-        help="Pageserver logs file. Reads from stdin if no file is provided.",
+        help="Pageserver logs file. Use '-' for stdin.",
+        required=True,
     )
+
     args = parser.parse_args()
     errors = _check_allowed_errors(args.input)
 
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py
similarity index 98%
rename from test_runner/fixtures/pageserver/types.py
rename to test_runner/fixtures/pageserver/common_types.py
index 1fb618f445..a6c327a8a0 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/common_types.py
@@ -2,7 +2,7 @@ import re
 from dataclasses import dataclass
 from typing import Any, Dict, Tuple, Union
 
-from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
+from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn
 
 
 @dataclass
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index b06972056c..0b2963d89c 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -11,10 +11,10 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import Fn
 
 
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index f47a3ea043..def80a1c3e 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -3,6 +3,7 @@ import time
 from typing import Any, Callable, Dict, Tuple
 
 import fixtures.pageserver.remote_storage
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -12,7 +13,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
 
 
 def single_timeline(
diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py
index e6cd9b4614..0c3612716a 100644
--- a/test_runner/fixtures/pageserver/remote_storage.py
+++ b/test_runner/fixtures/pageserver/remote_storage.py
@@ -6,13 +6,13 @@ import threading
 from pathlib import Path
 from typing import Any, List, Tuple
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import NeonEnv, Pagectl
-from fixtures.pageserver.types import (
+from fixtures.pageserver.common_types import (
     InvalidFileName,
     parse_layer_file_name,
 )
 from fixtures.remote_storage import LocalFsStorage
-from fixtures.types import TenantId, TimelineId
 
 
 def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 4b0dd7a815..91435e8a1f 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -8,10 +8,10 @@ from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
 
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 132d2450a7..ee18c53b52 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -12,8 +12,8 @@ import boto3
 import toml
 from mypy_boto3_s3 import S3Client
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.types import TenantId, TimelineId
 
 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
 TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index b9c1986818..82148d0556 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import pytest
 import requests
 
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.types import Lsn, TenantId, TimelineId
 
 
 # Walreceiver as returned by sk's timeline status endpoint.
diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py
index 2818a493d6..0e4b5d7883 100644
--- a/test_runner/fixtures/safekeeper/utils.py
+++ b/test_runner/fixtures/safekeeper/utils.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.safekeeper.http import SafekeeperHttpClient
-from fixtures.types import TenantId, TimelineId
 
 
 def are_walreceivers_absent(
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 6470621900..16dc9e8cfb 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -25,14 +25,14 @@ import zstandard
 from psycopg2.extensions import cursor
 
 from fixtures.log_helper import log
-from fixtures.pageserver.types import (
+from fixtures.pageserver.common_types import (
     parse_delta_layer,
     parse_image_layer,
 )
 
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import PgBin
-from fixtures.types import TimelineId
+from fixtures.common_types import TimelineId
 
 Fn = TypeVar("Fn", bound=Callable[..., Any])
 
@@ -452,6 +452,7 @@ def humantime_to_ms(humantime: str) -> float:
 
 
 def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
+    # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py
     error_or_warn = re.compile(r"\s(ERROR|WARN)")
     errors = []
     for lineno, line in enumerate(input, start=1):
@@ -484,7 +485,7 @@ def assert_no_errors(log_file, service, allowed_errors):
     for _lineno, error in errors:
         log.info(f"not allowed {service} error: {error.strip()}")
 
-    assert not errors, f"Log errors on {service}: {errors[0]}"
+    assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add"
 
 
 @enum.unique
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index c44628ce06..dfd9caba3e 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -1,6 +1,7 @@
 import threading
 from typing import Any, Optional
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -10,7 +11,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.types import TenantId, TimelineId
 
 # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
 # to ensure we don't do that: this enables running lots of Workloads in parallel safely.
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 009d62c9ba..f31cd9a9f8 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -5,13 +5,13 @@ Utilities used by all code in this sub-directory
 from typing import Any, Callable, Dict, Tuple
 
 import fixtures.pageserver.many_tenants as many_tenants
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import wait_until_all_tenants_state
-from fixtures.types import TenantId, TimelineId
 
 
 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 7687b8417f..b3866f1813 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -9,11 +9,11 @@ from typing import List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonPageserver
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import wait_until
 from prometheus_client.samples import Sample
 
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index 1df3f2f5f1..3f56da7c1d 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -2,10 +2,10 @@ from contextlib import closing
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn
 
 
 #
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 632d465c3f..cb013ae8c3 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -3,6 +3,7 @@ import random
 import time
 
 import pytest
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -10,7 +11,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TenantShardId, TimelineId
 
 
 @pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py
index 7eb244d378..513ebc74c3 100644
--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -6,10 +6,10 @@ from typing import Any, Callable, List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin
-from fixtures.types import Lsn
 
 from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
 
diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py
index d16d2d6a24..7e40081aa2 100644
--- a/test_runner/regress/test_ancestor_branch.py
+++ b/test_runner/regress/test_ancestor_branch.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.types import TimelineId
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 693add422f..2ec375271c 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -2,13 +2,13 @@ from dataclasses import dataclass
 from typing import Generator, Optional
 
 import pytest
+from fixtures.common_types import TenantId
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverApiException, TenantConfig
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import TenantId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index bb622c0d59..035ab2796f 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -4,13 +4,13 @@ from pathlib import Path
 
 import psycopg2
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgProtocol,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
-from fixtures.types import TenantId, TimelineId
 
 
 def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index ddd02238ea..eb503ddbfa 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -2,10 +2,10 @@ import threading
 import time
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pageserver.http import TimelineCreate406
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index b79cad979f..ac2fc79be4 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -1,8 +1,8 @@
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.http import TimelineCreate406
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
 
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 9fe9f77fea..03d6946c15 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
 from typing import List
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -14,7 +15,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import wait_until_tenant_active
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
 from requests import RequestException
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 1279c1bf81..7d4e101189 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -3,6 +3,7 @@ import os
 from typing import List, Tuple
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -11,7 +12,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TimelineId
 
 
 # Test restarting page server, while safekeeper and compute node keep
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 787c114fc1..ef35bf4696 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -7,6 +7,7 @@ from typing import List, Optional
 
 import pytest
 import toml
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -21,7 +22,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
 
 #
 # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 1d73e9cb18..7ae2352c06 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, Iterable, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -16,7 +17,6 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py
index 7471338ce5..0ebb99c712 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -2,7 +2,7 @@ import time
 
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload_queue_empty,
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index d5f898492b..e1e4f700d4 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -8,7 +9,6 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
 )
 from fixtures.port_distributor import PortDistributor
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar, subprocess_capture
 
 num_rows = 1000
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index c5070ee815..e5067bba8b 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -2,6 +2,7 @@ import asyncio
 import concurrent.futures
 import random
 
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -10,7 +11,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TimelineId
 
 # Test configuration
 #
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 132427ba2d..1f1c8cc582 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -7,6 +7,7 @@ from contextlib import closing
 from pathlib import Path
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -20,7 +21,6 @@ from fixtures.pageserver.utils import (
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import subprocess_capture
 
 
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index b178baea11..0ef4f6d95b 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     flush_ep_to_pageserver,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import wait_for_upload
 from fixtures.remote_storage import RemoteStorageKind
 
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index cc34fd83e9..18e5111786 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -1,8 +1,9 @@
 import time
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
-from fixtures.pageserver.types import (
+from fixtures.pageserver.common_types import (
     DeltaLayerName,
     ImageLayerName,
     is_future_layer,
@@ -13,7 +14,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
 
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 57d3447cae..a657d5a035 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -4,6 +4,7 @@ from random import choice
 from string import ascii_lowercase
 
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     AuxFileStore,
@@ -12,7 +13,6 @@ from fixtures.neon_fixtures import (
     logical_replication_sync,
     wait_for_last_flush_lsn,
 )
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
 
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 225622868d..83d52d4c4c 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -2,10 +2,10 @@ import re
 import time
 from datetime import datetime, timedelta, timezone
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index cb69f0ef39..ba170cfb4c 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -5,6 +5,7 @@ from typing import cast
 
 import pytest
 import requests
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
@@ -13,7 +14,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pg_version import PgVersion, skip_on_postgres
-from fixtures.types import TenantId, TimelineId
 
 
 def helper_compare_timeline_list(
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index e880445c4d..45c0e3e409 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -3,13 +3,13 @@ import os
 import time
 from pathlib import Path
 
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 391305c58a..43b0bb56f0 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.types import TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
 
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 6c2556f6a2..b51754c9e0 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -5,6 +5,7 @@ import time
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, Tuple
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -21,7 +22,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
 
diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py
index 0b36b32552..4af7dcdfc3 100644
--- a/test_runner/regress/test_ondemand_slru_download.py
+++ b/test_runner/regress/test_ondemand_slru_download.py
@@ -1,9 +1,9 @@
 from typing import Optional
 
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index bd7e4f118f..80a1d72f4a 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -3,13 +3,13 @@ from pathlib import Path
 from typing import Optional
 
 import toml
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 58eaf404d3..a38bcd45da 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -16,6 +16,7 @@ import time
 from typing import Optional
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -37,7 +38,6 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import (
     RemoteStorageKind,
 )
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import print_gc_result, wait_until
 from fixtures.workload import Workload
 
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 42cc28efee..111285b40c 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -2,10 +2,10 @@ import json
 import uuid
 
 from anyio import Path
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index c5dc0f2919..aab0536f5a 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -5,6 +5,7 @@ from typing import Optional, Tuple
 
 import psutil
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -13,7 +14,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 TIMELINE_COUNT = 10
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index c34ef46d07..b0465f2a96 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from queue import SimpleQueue
 from typing import Any, Dict, Set
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -17,7 +18,6 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     remote_storage_to_toml_inline_table,
 )
-from fixtures.types import TenantId, TimelineId
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index c40bb962f2..fdc09a063d 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -5,9 +5,10 @@ import time
 from typing import Any, Dict, Optional
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
     poll_for_remote_storage_iterations,
@@ -15,7 +16,6 @@ from fixtures.pageserver.utils import (
     wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index 539ef3eda7..6434f431a4 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.types import TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
 
diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py
index e6b3ccd7ec..cc5853b727 100644
--- a/test_runner/regress/test_read_trace.py
+++ b/test_runner/regress/test_read_trace.py
@@ -1,8 +1,8 @@
 from contextlib import closing
 
+from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index b7c8f36107..ba8b91e84d 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -1,8 +1,8 @@
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 70c025c225..7f79bf5d5c 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -6,13 +6,14 @@ import time
 from typing import Dict, List, Optional, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
-from fixtures.pageserver.types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
@@ -25,7 +26,6 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     available_remote_storages,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     assert_eq,
     assert_ge,
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 9227836862..7fdabaaec7 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime, timezone
 
+from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
@@ -14,7 +15,6 @@ from fixtures.pageserver.utils import (
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.types import Lsn
 from fixtures.utils import run_pg_bench_small
 
 
diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py
index 018c1637d0..8981000c24 100644
--- a/test_runner/regress/test_s3_scrubber.py
+++ b/test_runner/regress/test_s3_scrubber.py
@@ -3,12 +3,12 @@ import shutil
 from typing import Optional
 
 import pytest
+from fixtures.common_types import TenantShardId
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     S3Scrubber,
 )
 from fixtures.remote_storage import S3Storage, s3_storage
-from fixtures.types import TenantShardId
 from fixtures.workload import Workload
 
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index d33803250f..87544af598 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -5,6 +5,7 @@ from typing import Dict, List, Optional, Union
 
 import pytest
 import requests
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -18,7 +19,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
 from fixtures.remote_storage import s3_storage
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 from pytest_httpserver import HTTPServer
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index bdd356388f..4a501e60ed 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -6,6 +6,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Union
 
 import pytest
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -25,7 +26,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.types import TenantId, TenantShardId, TimelineId
 from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
 from fixtures.workload import Workload
 from mypy_boto3_s3.type_defs import (
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index fc099297e1..a345464208 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -2,13 +2,13 @@ import json
 from contextlib import closing
 
 import psycopg2.extras
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 363c3c88ec..3fc44de6fa 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -5,6 +5,7 @@ import shutil
 from threading import Thread
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -26,7 +27,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 0ba0108651..12a4730e69 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -7,6 +7,7 @@ from typing import List, Optional
 
 import asyncpg
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -22,7 +23,6 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import (
     RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample
 
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 68d9d9a660..be289e03d6 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
 from fixtures.pageserver.http import PageserverHttpClient
@@ -20,7 +21,6 @@ from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     query_scalar,
     wait_until,
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index e73eae91f0..7894f6933d 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import List, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -19,7 +20,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index 75e5c2c91c..d08ad3cd2e 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -1,3 +1,4 @@
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.utils import (
@@ -5,7 +6,6 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_until_tenant_active,
 )
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 2832304dcc..93e9ad3673 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -9,6 +9,7 @@ from typing import List
 
 import pytest
 import requests
+from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
 from fixtures.metrics import (
     PAGESERVER_GLOBAL_METRICS,
@@ -24,7 +25,6 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId
 from fixtures.utils import wait_until
 from prometheus_client.samples import Sample
 
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index a1e96928bf..168876b711 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -11,6 +11,7 @@ import os
 from pathlib import Path
 from typing import List, Tuple
 
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -18,7 +19,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     last_flush_lsn_upload,
 )
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     wait_for_last_record_lsn,
@@ -28,7 +29,6 @@ from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 
 
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 0eb1327c9e..da37f469b3 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -6,6 +6,7 @@ import threading
 
 import pytest
 import requests
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -31,7 +32,6 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     s3_storage,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
 from urllib3.util.retry import Retry
 
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 1e961a4b2f..8406de8bc1 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -10,6 +10,7 @@ from threading import Barrier
 from typing import IO, List, Set, Tuple, Union
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -19,7 +20,6 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import HistoricLayerInfo
 from fixtures.pageserver.utils import wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
-from fixtures.types import Lsn, TimelineId
 
 
 def by_end_lsn(info: HistoricLayerInfo) -> Lsn:
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 628c484fbd..18063bf104 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -10,6 +10,7 @@ from typing import Optional
 import psycopg2.errors
 import psycopg2.extras
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -31,7 +32,6 @@ from fixtures.pageserver.utils import (
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import get_timeline_dir_size, wait_until
 
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 967d133e18..ea66eeff63 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -18,6 +18,7 @@ import psycopg2.errors
 import psycopg2.extras
 import pytest
 from fixtures.broker import NeonBroker
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
@@ -47,7 +48,6 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import get_dir_size, query_scalar, start_in_background
 
 
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index dce5616ac6..b5d86de574 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -8,10 +8,10 @@ from typing import List, Optional
 import asyncpg
 import pytest
 import toml
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import getLogger
 from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 
 log = getLogger("root.safekeeper_async")
 
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index 7ac6e6332c..d9265dcbcd 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -1,8 +1,8 @@
 import time
 
+from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
-from fixtures.types import Lsn, TenantId
 
 
 # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 083a259d85..01a1d5cf55 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pytest
 import zstandard
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -19,7 +20,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage
-from fixtures.types import Lsn, TenantId, TimelineId
 from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index 13159efbe8..ad37807dba 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -2,10 +2,10 @@ import time
 
 import psutil
 import pytest
+from fixtures.common_types import TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId
 
 
 def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False):

From 5a0da93c530e99a374a9b2cabc1e2034a68773f1 Mon Sep 17 00:00:00 2001
From: Jure Bajic <jure.bajic94@gmail.com>
Date: Mon, 13 May 2024 14:18:14 +0200
Subject: [PATCH 034/130] Fix `test_lock_time_tracing` flakiness (#7712)

## Problem

Closes
[test_lock_time_tracing](https://github.com/neondatabase/neon/issues/7691)

## Summary of changes

Taking a look at the execution of the same test in logs, it can be
concluded that the time we are holding the lock is sometimes not
enough(must be above 30s) to cause the second log to be shown by the
thread that is creating a timeline.

In the [successful
execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9021247520/index.html#testresult/a21bce8c702b37f0)
it can be seen that the log `Operation TimelineCreate on key
5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared
lock` was on the edge of being logged, if it was below 30s it would not
be shown.

```
2024-05-09T18:02:32.552093Z  WARN request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Lock on UpdatePolicy was held for 31.001892592s
2024-05-09T18:02:32.552109Z  INFO request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Request handled, status: 200 OK
2024-05-09T18:02:32.552271Z  WARN request{method=POST path=/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/timeline request_id=d3af756e-dbb3-476b-89bd-3594f19bbb67}: Operation TimelineCreate on key 5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared lock
```

In the [failed
execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9022743601/index.html#/testresult/deb90136aeae4fce):
```
2024-05-09T20:14:33.526311Z  INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af
2024-05-09T20:14:36.441165Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:41.441657Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:41.535227Z  INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Handling request
2024-05-09T20:14:41.535269Z  INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: handle_validate: 68194ffadb61ca11adcbb11cbeb4ec6e(gen 1): valid=true (latest Some(00000001))
2024-05-09T20:14:41.535284Z  INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Request handled, status: 200 OK
2024-05-09T20:14:46.441854Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:51.441151Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:56.441199Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:15:01.440971Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:15:03.516320Z  INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: failpoint "tenant-update-policy-exclusive-lock": sleep done
2024-05-09T20:15:03.518474Z  INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Updated scheduling policy to Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000
2024-05-09T20:15:03.518512Z  WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000
2024-05-09T20:15:03.518540Z  WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Lock on UpdatePolicy was held for 31.003712703s
2024-05-09T20:15:03.518570Z  INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Request handled, status: 200 OK
2024-05-09T20:15:03.518804Z  WARN request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000
2024-05-09T20:15:03.518815Z  INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline on shard 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af, attached to node 1 (localhost)
```
we can see that the difference between starting to create timeline
`2024-05-09T20:14:33.526311Z` and creating timeline
`2024-05-09T20:15:03.518815Z` is not above 30s and will not cause any
logs to appear.

The proposed solution is to prolong how long we will pause to ensure
that the thread that creates the timeline waits above 30s.
---
 test_runner/regress/test_storage_controller.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 4a501e60ed..3a9a522f3f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1284,7 +1284,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
 
     # Apply failpoint
     env.storage_controller.configure_failpoints(
-        ("tenant-update-policy-exclusive-lock", "return(31000)")
+        ("tenant-update-policy-exclusive-lock", "return(35000)")
     )
 
     # This will hold the exclusive for enough time to cause an warning
@@ -1306,7 +1306,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.pageserver_api().timeline_create(
         pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
     )
-    thread_update_tenant_policy.join(timeout=10)
+    thread_update_tenant_policy.join()
 
     env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
     env.storage_controller.assert_log_contains(

From bbe730d7caec380edb389487fb1d6db4c9b12cc8 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 13 May 2024 13:41:14 +0100
Subject: [PATCH 035/130] Revert protocol version upgrade (#7727)

## Problem

"John pointed out that the switch to protocol version 2 made
test_gc_aggressive test flaky:
https://github.com/neondatabase/neon/issues/7692.
I tracked it down, and that is indeed an issue. Conditions for hitting
the issue:
The problem occurs in the primary
GC horizon is set to a very low value, e.g. 0.
If the primary is actively writing WAL, and GC runs in the pageserver at
the same time that the primary sends a GetPage request, it's possible
that the GC advances the GC horizon past the GetPage request's LSN. I'm
working on a fix here: https://github.com/neondatabase/neon/pull/7708."
- Heikki

## Summary of changes
Use protocol version 1 as default.
---
 pgxn/neon/libpagestore.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index f5ce2caff3..b7b1e7ccbf 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,7 +49,7 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;
 
-int         neon_protocol_version = 2;
+int         neon_protocol_version = 1;
 
 static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
 							"Version of compute<->page server protocol",
 							NULL,
 							&neon_protocol_version,
-							2, /* use protocol version 2 */
+							1, /* default to old protocol for now */
 							1, /* min */
 							2, /* max */
 							PGC_SU_BACKEND,

From 6ff74295b5b21d54192d20d114d78621d8d53ba0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 13 May 2024 14:52:06 +0200
Subject: [PATCH 036/130] chore(pageserver): plumb through RequestContext to
 VirtualFile open methods (#7725)

This PR introduces no functional changes.

The `open()` path will be done separately.

refs https://github.com/neondatabase/neon/issues/6107
refs https://github.com/neondatabase/neon/issues/7386

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/ctl/src/layer_map_analyzer.rs      |   2 +-
 pageserver/ctl/src/layers.rs                  |   2 +-
 pageserver/src/pgdatadir_mapping.rs           |   2 +-
 pageserver/src/tenant/blob_io.rs              |   4 +-
 pageserver/src/tenant/ephemeral_file.rs       |   4 +-
 .../tenant/remote_timeline_client/download.rs |  21 ++--
 .../src/tenant/storage_layer/delta_layer.rs   |  10 +-
 .../src/tenant/storage_layer/image_layer.rs   |   8 +-
 .../tenant/storage_layer/inmemory_layer.rs    |   4 +-
 pageserver/src/tenant/timeline.rs             |  35 ++++--
 pageserver/src/tenant/timeline/compaction.rs  |   4 +
 .../src/tenant/timeline/detach_ancestor.rs    |   2 +
 .../src/tenant/timeline/layer_manager.rs      |   4 +-
 pageserver/src/virtual_file.rs                | 102 +++++++++++++-----
 14 files changed, 143 insertions(+), 61 deletions(-)

diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index c4c282f33d..b4bb239f44 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 
 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = VirtualFile::open(path).await?;
+    let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
     let block_reader = FileBlockReader::new(&file, file_id);
     let summary_blk = block_reader.read_blk(0, ctx).await?;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index be8f91675d..3611b0baab 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
     virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
     page_cache::init(100);
-    let file = VirtualFile::open(path).await?;
+    let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
     let block_reader = FileBlockReader::new(&file, file_id);
     let summary_blk = block_reader.read_blk(0, ctx).await?;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index a4215ee107..ffcab5f140 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1671,7 +1671,7 @@ impl<'a> DatadirModification<'a> {
         }
 
         if !self.pending_deletions.is_empty() {
-            writer.delete_batch(&self.pending_deletions).await?;
+            writer.delete_batch(&self.pending_deletions, ctx).await?;
             self.pending_deletions.clear();
         }
 
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 1dc451f5c9..24b4e4f3ea 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -299,7 +299,7 @@ mod tests {
         // Write part (in block to drop the file)
         let mut offsets = Vec::new();
         {
-            let file = VirtualFile::create(pathbuf.as_path()).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
             let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
             for blob in blobs.iter() {
                 let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
@@ -314,7 +314,7 @@ mod tests {
             wtr.flush_buffer(&ctx).await?;
         }
 
-        let file = VirtualFile::open(pathbuf.as_path()).await?;
+        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
         let rdr = BlockReaderRef::VirtualFile(&file);
         let rdr = BlockCursor::new(rdr);
         for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 8b815a1885..79cc7bf153 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -28,6 +28,7 @@ impl EphemeralFile {
         conf: &PageServerConf,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        ctx: &RequestContext,
     ) -> Result<EphemeralFile, io::Error> {
         static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
         let filename_disambiguator =
@@ -45,6 +46,7 @@ impl EphemeralFile {
                 .read(true)
                 .write(true)
                 .create(true),
+            ctx,
         )
         .await?;
 
@@ -153,7 +155,7 @@ mod tests {
     async fn test_ephemeral_blobs() -> Result<(), io::Error> {
         let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
 
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
 
         let pos_foo = file.write_blob(b"foo", &ctx).await?;
         assert_eq!(
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index b464437422..f3c9e64533 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -112,14 +112,17 @@ pub async fn download_layer_file<'a>(
     // We use fatal_err() below because the after the rename above,
     // the in-memory state of the filesystem already has the layer file in its final place,
     // and subsequent pageserver code could think it's durable while it really isn't.
-    let work = async move {
-        let timeline_dir = VirtualFile::open(&timeline_path)
-            .await
-            .fatal_err("VirtualFile::open for timeline dir fsync");
-        timeline_dir
-            .sync_all()
-            .await
-            .fatal_err("VirtualFile::sync_all timeline dir");
+    let work = {
+        let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
+        async move {
+            let timeline_dir = VirtualFile::open(&timeline_path, &ctx)
+                .await
+                .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
     };
     crate::virtual_file::io_engine::get()
         .spawn_blocking_and_block_on_if_std(work)
@@ -196,7 +199,7 @@ async fn download_object<'a>(
             use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
             use bytes::BytesMut;
             async {
-                let destination_file = VirtualFile::create(dst_path)
+                let destination_file = VirtualFile::create(dst_path, ctx)
                     .await
                     .with_context(|| format!("create a destination file for layer '{dst_path}'"))
                     .map_err(DownloadError::Other)?;
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 4f30cf2e84..1b3802840f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -394,6 +394,7 @@ impl DeltaLayerWriterInner {
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename. We don't know
         // the end key yet, so we cannot form the final filename yet. We will
@@ -404,7 +405,7 @@ impl DeltaLayerWriterInner {
         let path =
             DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
 
-        let mut file = VirtualFile::create(&path).await?;
+        let mut file = VirtualFile::create(&path, ctx).await?;
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
@@ -586,6 +587,7 @@ impl DeltaLayerWriter {
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             inner: Some(
@@ -595,6 +597,7 @@ impl DeltaLayerWriter {
                     tenant_shard_id,
                     key_start,
                     lsn_range,
+                    ctx,
                 )
                 .await?,
             ),
@@ -701,6 +704,7 @@ impl DeltaLayer {
         let mut file = VirtualFile::open_with_options(
             path,
             virtual_file::OpenOptions::new().read(true).write(true),
+            ctx,
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -734,7 +738,7 @@ impl DeltaLayerInner {
         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
-        let file = match VirtualFile::open(path).await {
+        let file = match VirtualFile::open(path, ctx).await {
             Ok(file) => file,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
         };
@@ -1792,6 +1796,7 @@ mod test {
             harness.tenant_shard_id,
             entries_meta.key_range.start,
             entries_meta.lsn_range.clone(),
+            &ctx,
         )
         .await?;
 
@@ -1979,6 +1984,7 @@ mod test {
                 tenant.tenant_shard_id,
                 Key::MIN,
                 Lsn(0x11)..truncate_at,
+                ctx,
             )
             .await
             .unwrap();
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 72d1f36cab..6ea452b993 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -343,6 +343,7 @@ impl ImageLayer {
         let mut file = VirtualFile::open_with_options(
             path,
             virtual_file::OpenOptions::new().read(true).write(true),
+            ctx,
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -377,7 +378,7 @@ impl ImageLayerInner {
         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
-        let file = match VirtualFile::open(path).await {
+        let file = match VirtualFile::open(path, ctx).await {
             Ok(file) => file,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
         };
@@ -632,6 +633,7 @@ impl ImageLayerWriterInner {
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename.
         // We'll atomically rename it to the final name when we're done.
@@ -651,6 +653,7 @@ impl ImageLayerWriterInner {
                 virtual_file::OpenOptions::new()
                     .write(true)
                     .create_new(true),
+                ctx,
             )
             .await?
         };
@@ -805,10 +808,11 @@ impl ImageLayerWriter {
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        ctx: &RequestContext,
     ) -> anyhow::Result<ImageLayerWriter> {
         Ok(Self {
             inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
+                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
                     .await?,
             ),
         })
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 4dacbec2f3..9553f83026 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -473,10 +473,11 @@ impl InMemoryLayer {
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         start_lsn: Lsn,
+        ctx: &RequestContext,
     ) -> Result<InMemoryLayer> {
         trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
 
-        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
         let key = InMemoryLayerFileId(file.page_cache_file_id());
 
         Ok(InMemoryLayer {
@@ -642,6 +643,7 @@ impl InMemoryLayer {
             self.tenant_shard_id,
             Key::MIN,
             self.start_lsn..end_lsn,
+            ctx,
         )
         .await?;
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 505dc8c30d..d2fcd6c4a5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3560,7 +3560,11 @@ impl Timeline {
     ///
     /// Get a handle to the latest layer for appending.
     ///
-    async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
+    async fn get_layer_for_write(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<InMemoryLayer>> {
         let mut guard = self.layers.write().await;
         let layer = guard
             .get_layer_for_write(
@@ -3569,6 +3573,7 @@ impl Timeline {
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
+                ctx,
             )
             .await?;
         Ok(layer)
@@ -3833,8 +3838,8 @@ impl Timeline {
                 );
                 self.create_delta_layer(
                     &frozen_layer,
-                    ctx,
                     Some(metadata_keyspace.0.ranges[0].clone()),
+                    ctx,
                 )
                 .await?
             } else {
@@ -3863,7 +3868,7 @@ impl Timeline {
             // Normal case, write out a L0 delta layer file.
             // `create_delta_layer` will not modify the layer map.
             // We will remove frozen layer and add delta layer in one atomic operation later.
-            let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
+            let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
                 panic!("delta layer cannot be empty if no filter is applied");
             };
             (
@@ -3992,8 +3997,8 @@ impl Timeline {
     async fn create_delta_layer(
         self: &Arc<Self>,
         frozen_layer: &Arc<InMemoryLayer>,
-        ctx: &RequestContext,
         key_range: Option<Range<Key>>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Option<ResidentLayer>> {
         let self_clone = Arc::clone(self);
         let frozen_layer = Arc::clone(frozen_layer);
@@ -4016,6 +4021,7 @@ impl Timeline {
                 &self_clone
                     .conf
                     .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
+                &ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4209,6 +4215,7 @@ impl Timeline {
                 self.tenant_shard_id,
                 &img_range,
                 lsn,
+                ctx,
             )
             .await?;
 
@@ -4313,6 +4320,7 @@ impl Timeline {
                 &self
                     .conf
                     .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -5214,7 +5222,7 @@ impl<'a> TimelineWriter<'a> {
         let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
 
         let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action).await?;
+        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
         let res = layer.put_value(key, lsn, &buf, ctx).await;
 
         if res.is_ok() {
@@ -5237,14 +5245,15 @@ impl<'a> TimelineWriter<'a> {
         &mut self,
         at: Lsn,
         action: OpenLayerAction,
+        ctx: &RequestContext,
     ) -> anyhow::Result<&Arc<InMemoryLayer>> {
         match action {
             OpenLayerAction::Roll => {
                 let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
                 self.roll_layer(freeze_at).await?;
-                self.open_layer(at).await?;
+                self.open_layer(at, ctx).await?;
             }
-            OpenLayerAction::Open => self.open_layer(at).await?,
+            OpenLayerAction::Open => self.open_layer(at, ctx).await?,
             OpenLayerAction::None => {
                 assert!(self.write_guard.is_some());
             }
@@ -5253,8 +5262,8 @@ impl<'a> TimelineWriter<'a> {
         Ok(&self.write_guard.as_ref().unwrap().open_layer)
     }
 
-    async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> {
-        let layer = self.tl.get_layer_for_write(at).await?;
+    async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
+        let layer = self.tl.get_layer_for_write(at, ctx).await?;
         let initial_size = layer.size().await?;
 
         let last_freeze_at = self.last_freeze_at.load();
@@ -5331,10 +5340,14 @@ impl<'a> TimelineWriter<'a> {
         Ok(())
     }
 
-    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+    pub(crate) async fn delete_batch(
+        &mut self,
+        batch: &[(Range<Key>, Lsn)],
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
         if let Some((_, lsn)) = batch.first() {
             let action = self.get_open_layer_action(*lsn, 0);
-            let layer = self.handle_open_layer_action(*lsn, action).await?;
+            let layer = self.handle_open_layer_action(*lsn, action, ctx).await?;
             layer.put_tombstones(batch).await?;
         }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index e83878b8fb..4226bf431e 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -700,6 +700,7 @@ impl Timeline {
                                 debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                 lsn_range.clone()
                             },
+                            ctx,
                         )
                         .await?,
                     );
@@ -755,6 +756,7 @@ impl Timeline {
                 &self
                     .conf
                     .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -1093,6 +1095,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range.start,
             lsn_range.clone(),
+            ctx,
         )
         .await?;
 
@@ -1167,6 +1170,7 @@ impl TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range,
             lsn,
+            ctx,
         )
         .await?;
 
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 69b82344a6..9471ba860f 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -215,6 +215,7 @@ pub(super) async fn prepare(
                 &detached
                     .conf
                     .timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
+                ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -339,6 +340,7 @@ async fn copy_lsn_prefix(
         target_timeline.tenant_shard_id,
         layer.layer_desc().key_range.start,
         layer.layer_desc().lsn_range.start..end_lsn,
+        ctx,
     )
     .await
     .map_err(CopyDeltaPrefix)?;
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index a72eb1b3bf..248420e632 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -9,6 +9,7 @@ use utils::{
 
 use crate::{
     config::PageServerConf,
+    context::RequestContext,
     metrics::TimelineMetrics,
     tenant::{
         layer_map::{BatchedUpdates, LayerMap},
@@ -69,6 +70,7 @@ impl LayerManager {
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
+        ctx: &RequestContext,
     ) -> Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());
 
@@ -105,7 +107,7 @@ impl LayerManager {
             );
 
             let new_layer =
-                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
             let layer = Arc::new(new_layer);
 
             self.layer_map.open_layer = Some(layer.clone());
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 8dee73891b..b68f3a0e89 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -344,16 +344,23 @@ macro_rules! with_file {
 
 impl VirtualFile {
     /// Open a file in read-only mode. Like File::open.
-    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(path, OpenOptions::new().read(true)).await
+    pub async fn open(
+        path: &Utf8Path,
+        ctx: &RequestContext,
+    ) -> Result<VirtualFile, std::io::Error> {
+        Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
     }
 
     /// Create a new file for writing. If the file exists, it will be truncated.
     /// Like File::create.
-    pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
+    pub async fn create(
+        path: &Utf8Path,
+        ctx: &RequestContext,
+    ) -> Result<VirtualFile, std::io::Error> {
         Self::open_with_options(
             path,
             OpenOptions::new().write(true).create(true).truncate(true),
+            ctx,
         )
         .await
     }
@@ -366,6 +373,7 @@ impl VirtualFile {
     pub async fn open_with_options(
         path: &Utf8Path,
         open_options: &OpenOptions,
+        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
     ) -> Result<VirtualFile, std::io::Error> {
         let path_str = path.to_string();
         let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -1179,7 +1187,6 @@ mod tests {
     use rand::seq::SliceRandom;
     use rand::thread_rng;
     use rand::Rng;
-    use std::future::Future;
     use std::io::Write;
     use std::os::unix::fs::FileExt;
     use std::sync::Arc;
@@ -1293,41 +1300,69 @@ mod tests {
         // results with VirtualFiles as with native Files. (Except that with
         // native files, you will run out of file descriptors if the ulimit
         // is low enough.)
-        test_files("virtual_files", |path, open_options| async move {
-            let vf = VirtualFile::open_with_options(&path, &open_options).await?;
-            Ok(MaybeVirtualFile::VirtualFile(vf))
-        })
-        .await
+        struct A;
+
+        impl Adapter for A {
+            async fn open(
+                path: Utf8PathBuf,
+                opts: OpenOptions,
+                ctx: &RequestContext,
+            ) -> Result<MaybeVirtualFile, anyhow::Error> {
+                let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?;
+                Ok(MaybeVirtualFile::VirtualFile(vf))
+            }
+        }
+        test_files::<A>("virtual_files").await
     }
 
     #[tokio::test]
     async fn test_physical_files() -> anyhow::Result<()> {
-        test_files("physical_files", |path, open_options| async move {
-            Ok(MaybeVirtualFile::File({
-                let owned_fd = open_options.open(path.as_std_path()).await?;
-                File::from(owned_fd)
-            }))
-        })
-        .await
+        struct B;
+
+        impl Adapter for B {
+            async fn open(
+                path: Utf8PathBuf,
+                opts: OpenOptions,
+                _ctx: &RequestContext,
+            ) -> Result<MaybeVirtualFile, anyhow::Error> {
+                Ok(MaybeVirtualFile::File({
+                    let owned_fd = opts.open(path.as_std_path()).await?;
+                    File::from(owned_fd)
+                }))
+            }
+        }
+
+        test_files::<B>("physical_files").await
     }
 
-    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
+    /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
+    /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function
+    /// in trait which benefits from the new lifetime capture rules already.
+    trait Adapter {
+        async fn open(
+            path: Utf8PathBuf,
+            opts: OpenOptions,
+            ctx: &RequestContext,
+        ) -> Result<MaybeVirtualFile, anyhow::Error>;
+    }
+
+    async fn test_files<A>(testname: &str) -> anyhow::Result<()>
     where
-        OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
-        FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
+        A: Adapter,
     {
         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir(testname);
         std::fs::create_dir_all(&testdir)?;
 
         let path_a = testdir.join("file_a");
-        let mut file_a = openfunc(
+        let mut file_a = A::open(
             path_a.clone(),
             OpenOptions::new()
                 .write(true)
                 .create(true)
                 .truncate(true)
                 .to_owned(),
+            &ctx,
         )
         .await?;
         file_a.write_all(b"foobar".to_vec(), &ctx).await?;
@@ -1336,7 +1371,7 @@ mod tests {
         let _ = file_a.read_string(&ctx).await.unwrap_err();
 
         // Close the file and re-open for reading
-        let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
+        let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
 
         // cannot write to a file opened in read-only mode
         let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
@@ -1371,7 +1406,7 @@ mod tests {
 
         // Create another test file, and try FileExt functions on it.
         let path_b = testdir.join("file_b");
-        let mut file_b = openfunc(
+        let mut file_b = A::open(
             path_b.clone(),
             OpenOptions::new()
                 .read(true)
@@ -1379,6 +1414,7 @@ mod tests {
                 .create(true)
                 .truncate(true)
                 .to_owned(),
+            &ctx,
         )
         .await?;
         file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
@@ -1394,8 +1430,12 @@ mod tests {
 
         let mut vfiles = Vec::new();
         for _ in 0..100 {
-            let mut vfile =
-                openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
+            let mut vfile = A::open(
+                path_b.clone(),
+                OpenOptions::new().read(true).to_owned(),
+                &ctx,
+            )
+            .await?;
             assert_eq!("FOOBAR", vfile.read_string(&ctx).await?);
             vfiles.push(vfile);
         }
@@ -1441,8 +1481,12 @@ mod tests {
         // Open the file many times.
         let mut files = Vec::new();
         for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))
-                .await?;
+            let f = VirtualFile::open_with_options(
+                &test_file_path,
+                OpenOptions::new().read(true),
+                &ctx,
+            )
+            .await?;
             files.push(f);
         }
         let files = Arc::new(files);
@@ -1488,7 +1532,7 @@ mod tests {
         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
         let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
@@ -1497,7 +1541,7 @@ mod tests {
         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
             .await
             .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
         let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "bar");
         assert!(!tmp_path.exists());
@@ -1521,7 +1565,7 @@ mod tests {
             .await
             .unwrap();
 
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
         let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());

From 55ba885f6be431b2b694bca4d95d7a40f4bda98a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 13 May 2024 14:16:03 +0100
Subject: [PATCH 037/130] CI(report-benchmarks-failures): report benchmarks
 failures to slack (#7678)

## Problem

`benchmarks` job that we run on the main doesn't block anything, so it's
easy to miss its failure.

Ref https://github.com/neondatabase/cloud/issues/13087

## Summary of changes
- Add `report-benchmarks-failures` job that report failures of
`benchmarks` job to a Slack channel
---
 .github/workflows/build_and_test.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 21e7a56670..f417cecd58 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -546,9 +546,27 @@ jobs:
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
+  report-benchmarks-failures:
+    needs: [ benchmarks, create-test-report ]
+    if: github.ref_name == 'main' && failure()
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: C060CNA47S9 # on-call-staging-storage-stream
+        slack-message: |
+          Benchmarks failed on main: ${{ github.event.head_commit.url }}
+
+          Allure report: ${{ needs.create-test-report.outputs.report-url }}
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
   create-test-report:
     needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:

From 4d8a10af1cc41563cc4542beb327a2d75fb1bad8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 13 May 2024 17:49:50 +0300
Subject: [PATCH 038/130] fix: do not create metrics contention from background
 task permit (#7730)

The background task loop permit metrics do two of `with_label_values`
very often. Change the codepath to cache the counters on first access
into a `Lazy` with `enum_map::EnumMap`. The expectation is that this
should not fix for metric collection failures under load, but it doesn't
hurt.

Cc: #7161
---
 pageserver/src/tenant/tasks.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index f153719f98..ba2b8afd03 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore
         tokio::sync::Semaphore::new(permits)
     });
 
-#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
 #[strum(serialize_all = "snake_case")]
 pub(crate) enum BackgroundLoopKind {
     Compaction,
@@ -57,19 +57,25 @@ pub(crate) enum BackgroundLoopKind {
 
 impl BackgroundLoopKind {
     fn as_static_str(&self) -> &'static str {
-        let s: &'static str = self.into();
-        s
+        self.into()
     }
 }
 
+static PERMIT_GAUGES: once_cell::sync::Lazy<
+    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
+> = once_cell::sync::Lazy::new(|| {
+    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
+    }))
+});
+
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
-        .with_label_values(&[loop_kind.as_static_str()])
-        .guard();
+    let _guard = PERMIT_GAUGES[loop_kind].guard();
 
     pausable_failpoint!(
         "initial-size-calculation-permit-pause",

From 7f517640011a5dc2c811c66c2767ae58e27d0c79 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 13 May 2024 11:33:41 -0400
Subject: [PATCH 039/130] feat(pageserver): add metrics for aux file size
 (#7623)

ref https://github.com/neondatabase/neon/issues/7443

## Summary of changes

This pull request adds a size estimator for aux files. Each timeline
stores a cached `isize` for the estimated total size of aux files. It
gets reset on basebackup, and gets updated for each aux file
modification. TODO: print a warning when it exceeds the size.

The size metrics is not accurate. Race between `on_basebackup` and other
functions could create a negative basebackup size, but the chance is
rare. Anyways, this does not impose any extra I/Os to the storage as
everything is computed in-memory.

The aux files are only stored on shard 0. As basebackups are only
generated on shard 0, only shard 0 will report this metrics.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/aux_file.rs          | 52 +++++++++++++++++++++++++++++
 pageserver/src/metrics.rs           | 15 +++++++++
 pageserver/src/pgdatadir_mapping.rs | 52 +++++++++++++++++++++--------
 pageserver/src/tenant/timeline.rs   | 32 ++++++++++++------
 test_runner/fixtures/metrics.py     |  1 +
 5 files changed, 128 insertions(+), 24 deletions(-)

diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs
index a26ed84a0d..e6d950487d 100644
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -1,3 +1,6 @@
+use std::sync::Arc;
+
+use ::metrics::IntGauge;
 use bytes::{Buf, BufMut, Bytes};
 use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
 use tracing::warn;
@@ -140,6 +143,55 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
     Ok(encoded)
 }
 
+/// An estimation of the size of aux files.
+pub struct AuxFileSizeEstimator {
+    aux_file_size_gauge: IntGauge,
+    size: Arc<std::sync::Mutex<Option<isize>>>,
+}
+
+impl AuxFileSizeEstimator {
+    pub fn new(aux_file_size_gauge: IntGauge) -> Self {
+        Self {
+            aux_file_size_gauge,
+            size: Arc::new(std::sync::Mutex::new(None)),
+        }
+    }
+
+    pub fn on_base_backup(&self, new_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        *guard = Some(new_size as isize);
+        self.report(new_size as isize);
+    }
+
+    pub fn on_add(&self, file_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size += file_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn on_remove(&self, file_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size -= file_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn on_update(&self, old_size: usize, new_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size += new_size as isize - old_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn report(&self, size: isize) {
+        self.aux_file_size_gauge.set(size as i64);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 256f2f334c..b27bfb43b0 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -585,6 +585,15 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define current logical size metric")
 });
 
+static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_aux_file_estimated_size",
+        "The size of all aux files for a timeline in aux file v2 store.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod initial_logical_size {
     use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
     use once_cell::sync::Lazy;
@@ -2115,6 +2124,7 @@ pub(crate) struct TimelineMetrics {
     resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
+    pub aux_file_size_gauge: IntGauge,
     pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
     pub evictions: IntCounter,
     pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
@@ -2187,6 +2197,9 @@ impl TimelineMetrics {
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let aux_file_size_gauge = AUX_FILE_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
         let directory_entries_count_gauge_closure = {
             let tenant_shard_id = *tenant_shard_id;
@@ -2224,6 +2237,7 @@ impl TimelineMetrics {
             last_record_gauge,
             resident_physical_size_gauge,
             current_logical_size_gauge,
+            aux_file_size_gauge,
             directory_entries_count_gauge,
             evictions,
             evictions_with_low_residence_duration: std::sync::RwLock::new(
@@ -2264,6 +2278,7 @@ impl TimelineMetrics {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
         let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
 
         self.evictions_with_low_residence_duration
             .write()
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index ffcab5f140..1092d64d33 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -699,13 +699,17 @@ impl Timeline {
             .await
             .context("scan")?;
         let mut result = HashMap::new();
+        let mut sz = 0;
         for (_, v) in kv {
             let v = v.context("get value")?;
             let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
             for (fname, content) in v {
+                sz += fname.len();
+                sz += content.len();
                 result.insert(fname, content);
             }
         }
+        self.aux_file_size_estimator.on_base_backup(sz);
         Ok(result)
     }
 
@@ -1474,23 +1478,45 @@ impl<'a> DatadirModification<'a> {
                 Err(PageReconstructError::MissingKey(_)) => None,
                 Err(e) => return Err(e.into()),
             };
-            let files = if let Some(ref old_val) = old_val {
+            let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
                 aux_file::decode_file_value(old_val)?
             } else {
                 Vec::new()
             };
-            let new_files = if content.is_empty() {
-                files
-                    .into_iter()
-                    .filter(|(p, _)| &path != p)
-                    .collect::<Vec<_>>()
-            } else {
-                files
-                    .into_iter()
-                    .filter(|(p, _)| &path != p)
-                    .chain(std::iter::once((path, content)))
-                    .collect::<Vec<_>>()
-            };
+            let mut other_files = Vec::with_capacity(files.len());
+            let mut modifying_file = None;
+            for file @ (p, content) in files {
+                if path == p {
+                    assert!(
+                        modifying_file.is_none(),
+                        "duplicated entries found for {}",
+                        path
+                    );
+                    modifying_file = Some(content);
+                } else {
+                    other_files.push(file);
+                }
+            }
+            let mut new_files = other_files;
+            match (modifying_file, content.is_empty()) {
+                (Some(old_content), false) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_update(old_content.len(), content.len());
+                    new_files.push((path, content));
+                }
+                (Some(old_content), true) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_remove(old_content.len());
+                    // not adding the file key to the final `new_files` vec.
+                }
+                (None, false) => {
+                    self.tline.aux_file_size_estimator.on_add(content.len());
+                    new_files.push((path, content));
+                }
+                (None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
+            }
             let new_val = aux_file::encode_file_value(&new_files)?;
             self.put(key, Value::Image(new_val.into()));
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d2fcd6c4a5..01f354b9e8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -61,9 +61,12 @@ use std::{
 };
 
 use crate::tenant::timeline::init::LocalLayerFileMetadata;
-use crate::tenant::{
-    layer_map::{LayerMap, SearchResult},
-    metadata::TimelineMetadata,
+use crate::{
+    aux_file::AuxFileSizeEstimator,
+    tenant::{
+        layer_map::{LayerMap, SearchResult},
+        metadata::TimelineMetadata,
+    },
 };
 use crate::{
     context::{DownloadBehavior, RequestContext},
@@ -409,6 +412,8 @@ pub struct Timeline {
 
     /// Keep aux directory cache to avoid it's reconstruction on each update
     pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
+
+    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
 }
 
 pub struct WalReceiverInfo {
@@ -2161,6 +2166,16 @@ impl Timeline {
         };
 
         Arc::new_cyclic(|myself| {
+            let metrics = TimelineMetrics::new(
+                &tenant_shard_id,
+                &timeline_id,
+                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                    "mtime",
+                    evictions_low_residence_duration_metric_threshold,
+                ),
+            );
+            let aux_file_metrics = metrics.aux_file_size_gauge.clone();
+
             let mut result = Timeline {
                 conf,
                 tenant_conf,
@@ -2192,14 +2207,7 @@ impl Timeline {
                 ancestor_timeline: ancestor,
                 ancestor_lsn: metadata.ancestor_lsn(),
 
-                metrics: TimelineMetrics::new(
-                    &tenant_shard_id,
-                    &timeline_id,
-                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
-                        "mtime",
-                        evictions_low_residence_duration_metric_threshold,
-                    ),
-                ),
+                metrics,
 
                 query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
                     &tenant_shard_id,
@@ -2263,6 +2271,8 @@ impl Timeline {
                     dir: None,
                     n_deltas: 0,
                 }),
+
+                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
             };
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 7d34e12ca3..8fa67e75c9 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -149,6 +149,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_storage_operations_seconds_sum_total",
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
+    "pageserver_aux_file_estimated_size",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken

From be0c73f8e7543ca4de8c89b816eb70f792f2685a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 13 May 2024 17:59:59 +0100
Subject: [PATCH 040/130] pageserver: improve API for invoking GC (#7655)

## Problem

In https://github.com/neondatabase/neon/pull/7531, I had a test flaky
because the GC API endpoint fails if the tenant happens not to be active
yet.

## Summary of changes

While adding that wait for the tenant to be active, I noticed that this
endpoint is kind of strange (spawns a TaskManager task) and has a
comment `// TODO: spawning is redundant now, need to hold the gate`, so
this PR cleans it up to just run the GC inline while holding a gate.

The GC code is updated to avoid assuming it runs inside a task manager
task. Avoiding checking the task_mgr cancellation token is safe, because
our timeline shutdown always cancels Timeline::cancel.
---
 pageserver/src/http/routes.rs     |   7 +-
 pageserver/src/tenant.rs          |   2 +-
 pageserver/src/tenant/mgr.rs      | 107 +++++++++++++-----------------
 pageserver/src/tenant/timeline.rs |   2 -
 4 files changed, 49 insertions(+), 69 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a8ca642dc5..2370561756 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1715,12 +1715,7 @@ async fn timeline_gc_handler(
     let gc_req: TimelineGcRequest = json_request(&mut request).await?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
-    let gc_result = wait_task_done
-        .await
-        .context("wait for gc task")
-        .map_err(ApiError::InternalServerError)?
-        .map_err(ApiError::InternalServerError)?;
+    let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
 
     json_response(StatusCode::OK, gc_result)
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 010e56a899..80d354d79e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2800,7 +2800,7 @@ impl Tenant {
         // See comments in [`Tenant::branch_timeline`] for more information about why branch
         // creation task can run concurrently with timeline's GC iteration.
         for timeline in gc_timelines {
-            if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
+            if cancel.is_cancelled() {
                 // We were requested to shut down. Stop and return with the progress we
                 // made.
                 break;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 6be66e99ad..7a3e36bf02 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2880,86 +2880,73 @@ use {
     utils::http::error::ApiError,
 };
 
-pub(crate) fn immediate_gc(
+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
+pub(crate) async fn immediate_gc(
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     gc_req: TimelineGcRequest,
     cancel: CancellationToken,
     ctx: &RequestContext,
-) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
-    let guard = TENANTS.read().unwrap();
-
-    let tenant = guard
-        .get(&tenant_shard_id)
-        .cloned()
-        .with_context(|| format!("tenant {tenant_shard_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+) -> Result<GcResult, ApiError> {
+    let tenant = {
+        let guard = TENANTS.read().unwrap();
+        guard
+            .get(&tenant_shard_id)
+            .cloned()
+            .with_context(|| format!("tenant {tenant_shard_id}"))
+            .map_err(|e| ApiError::NotFound(e.into()))?
+    };
 
     let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
     // Use tenant's pitr setting
     let pitr = tenant.get_pitr_interval();
 
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
     // Run in task_mgr to avoid race with tenant_detach operation
-    let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
-    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
-    let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
+    let ctx: RequestContext =
+        ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 
-    // TODO: spawning is redundant now, need to hold the gate
-    task_mgr::spawn(
-        &tokio::runtime::Handle::current(),
-        TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
-        Some(timeline_id),
-        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
-        false,
-        async move {
-            fail::fail_point!("immediate_gc_task_pre");
+    let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
 
-            #[allow(unused_mut)]
-            let mut result = tenant
-                .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-                .await;
-                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
-                // better once the types support it.
+    fail::fail_point!("immediate_gc_task_pre");
 
-            #[cfg(feature = "testing")]
-            {
-                // we need to synchronize with drop completion for python tests without polling for
-                // log messages
-                if let Ok(result) = result.as_mut() {
-                    let mut js = tokio::task::JoinSet::new();
-                    for layer in std::mem::take(&mut result.doomed_layers) {
-                        js.spawn(layer.wait_drop());
-                    }
-                    tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
-                    while let Some(res) = js.join_next().await {
-                        res.expect("wait_drop should not panic");
-                    }
-                }
+    #[allow(unused_mut)]
+    let mut result = tenant
+        .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+        .await;
+    // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
+    // better once the types support it.
 
-                let timeline = tenant.get_timeline(timeline_id, false).ok();
-                let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
-
-                if let Some(rtc) = rtc {
-                    // layer drops schedule actions on remote timeline client to actually do the
-                    // deletions; don't care about the shutdown error, just exit fast
-                    drop(rtc.wait_completion().await);
-                }
+    #[cfg(feature = "testing")]
+    {
+        // we need to synchronize with drop completion for python tests without polling for
+        // log messages
+        if let Ok(result) = result.as_mut() {
+            let mut js = tokio::task::JoinSet::new();
+            for layer in std::mem::take(&mut result.doomed_layers) {
+                js.spawn(layer.wait_drop());
             }
-
-            match task_done.send(result) {
-                Ok(_) => (),
-                Err(result) => error!("failed to send gc result: {result:?}"),
+            tracing::info!(
+                total = js.len(),
+                "starting to wait for the gc'd layers to be dropped"
+            );
+            while let Some(res) = js.join_next().await {
+                res.expect("wait_drop should not panic");
             }
-            Ok(())
         }
-        .instrument(span)
-    );
 
-    // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
-    drop(guard);
+        let timeline = tenant.get_timeline(timeline_id, false).ok();
+        let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
 
-    Ok(wait_task_done)
+        if let Some(rtc) = rtc {
+            // layer drops schedule actions on remote timeline client to actually do the
+            // deletions; don't care about the shutdown error, just exit fast
+            drop(rtc.wait_completion().await);
+        }
+    }
+
+    result.map_err(ApiError::InternalServerError)
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 01f354b9e8..9ee24a4ff0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4656,11 +4656,9 @@ impl Timeline {
     pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
         // this is most likely the background tasks, but it might be the spawned task from
         // immediate_gc
-        let cancel = crate::task_mgr::shutdown_token();
         let _g = tokio::select! {
             guard = self.gc_lock.lock() => guard,
             _ = self.cancel.cancelled() => return Ok(GcResult::default()),
-            _ = cancel.cancelled() => return Ok(GcResult::default()),
         };
         let timer = self.metrics.garbage_collect_histo.start_timer();
 

From 1412e9b3e827435bab6de0eeea0a3f9d721273c0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 13 May 2024 18:24:12 +0100
Subject: [PATCH 041/130] pagectl: fix diagrams generation for paths containing
 generations (#7739)

## Problem
When layer paths include generations, the lsn parsing does not work and
`pagectl` errors out.

## Summary of changes
If the last "word" of the layer path contains 8 characters, discard it
for the purpose of lsn parsing.
---
 pageserver/ctl/src/draw_timeline_dir.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs
index d8082f8ab4..4dff8af1fc 100644
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -83,6 +83,11 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
     let split: Vec<&str> = name.split("__").collect();
     let keys: Vec<&str> = split[0].split('-').collect();
     let mut lsns: Vec<&str> = split[1].split('-').collect();
+
+    if lsns.last().expect("should").len() == 8 {
+        lsns.pop();
+    }
+
     if lsns.len() == 1 {
         lsns.push(lsns[0]);
     }

From 972470b1745146ad28c833385d34f8d413e34d1d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 13 May 2024 18:38:30 +0100
Subject: [PATCH 042/130] pageserver: use adaptive concurrency in secondary
 layer downloads (#7675)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

Secondary downloads are a low priority task, and intentionally do not
try to max out download speeds. This is almost always fine when they are
used through the life of a tenant shard as a continuous "trickle" of
background downloads.

However, there are sometimes circumstances where we would like to
populate a secondary location as fast as we can, within the constraint
that we don't want to impact the activity of attached tenants:
- During node removal, where we will need to create replacements for
secondary locations on the node being removed
- After a shard split, we need new secondary locations for the new
shards to populate before the shards can be migrated to their final
location.

## Summary of changes

- Add an activity() function to the remote storage interface, enabling
callers to query how busy the remote storage backend is
- In the secondary download code, use a very modest amount of
concurrency, driven by the remote storage's state: we only use
concurrency if the remote storage semaphore is 75% free, and scale the
amount of concurrency used within that range.

This is not a super clever form of prioritization, but it should
accomplish the key goals:
- Enable secondary downloads to happen faster when the system is idle
- Make secondary downloads a much lower priority than attached tenants
when the remote storage is busy.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 libs/remote_storage/src/azure_blob.rs         |   5 +
 libs/remote_storage/src/lib.rs                |  34 +++
 libs/remote_storage/src/local_fs.rs           |  14 +-
 libs/remote_storage/src/s3_bucket.rs          |   8 +-
 libs/remote_storage/src/simulate_failures.rs  |   6 +-
 pageserver/src/tenant/secondary/downloader.rs | 238 +++++++++++++-----
 6 files changed, 238 insertions(+), 67 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 24c1248304..220d4ef115 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -29,6 +29,7 @@ use http_types::{StatusCode, Url};
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
+use crate::RemoteStorageActivity;
 use crate::{
     error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
     DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
         // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
         Err(TimeTravelError::Unimplemented)
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }
 
 pin_project_lite::pin_project! {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 708662f20f..f024021507 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
         done_if_after: SystemTime,
         cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError>;
+
+    /// Query how busy we currently are: may be used by callers which wish to politely
+    /// back off if there are already a lot of operations underway.
+    fn activity(&self) -> RemoteStorageActivity;
+}
+
+pub struct RemoteStorageActivity {
+    pub read_available: usize,
+    pub read_total: usize,
+    pub write_available: usize,
+    pub write_total: usize,
 }
 
 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -444,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
             }
         }
     }
+
+    pub fn activity(&self) -> RemoteStorageActivity {
+        match self {
+            Self::LocalFs(s) => s.activity(),
+            Self::AwsS3(s) => s.activity(),
+            Self::AzureBlob(s) => s.activity(),
+            Self::Unreliable(s) => s.activity(),
+        }
+    }
 }
 
 impl GenericRemoteStorage {
@@ -774,6 +794,9 @@ struct ConcurrencyLimiter {
     // The helps to ensure we don't exceed the thresholds.
     write: Arc<Semaphore>,
     read: Arc<Semaphore>,
+
+    write_total: usize,
+    read_total: usize,
 }
 
 impl ConcurrencyLimiter {
@@ -802,10 +825,21 @@ impl ConcurrencyLimiter {
         Arc::clone(self.for_kind(kind)).acquire_owned().await
     }
 
+    fn activity(&self) -> RemoteStorageActivity {
+        RemoteStorageActivity {
+            read_available: self.read.available_permits(),
+            read_total: self.read_total,
+            write_available: self.write.available_permits(),
+            write_total: self.write_total,
+        }
+    }
+
     fn new(limit: usize) -> ConcurrencyLimiter {
         Self {
             read: Arc::new(Semaphore::new(limit)),
             write: Arc::new(Semaphore::new(limit)),
+            read_total: limit,
+            write_total: limit,
         }
     }
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 1f7bcfc982..f12f6590a3 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;
 
 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
+    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
     ) -> Result<(), TimeTravelError> {
         Err(TimeTravelError::Unimplemented)
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
+        RemoteStorageActivity {
+            read_available: 16,
+            read_total: 16,
+            write_available: 16,
+            write_total: 16,
+        }
+    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index c3d6c75e20..0f6772b274 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,8 +47,8 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
     error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
+    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 pub(super) mod metrics;
@@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket {
         }
         Ok(())
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }
 
 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index c467a2d196..66522e04ca 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
 
 use crate::{
     Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    StorageMetadata, TimeTravelError,
+    RemoteStorageActivity, StorageMetadata, TimeTravelError,
 };
 
 pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
             .time_travel_recover(prefix, timestamp, done_if_after, cancel)
             .await
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.inner.activity()
+    }
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 2a8f83be95..c28e041fa2 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,10 +45,10 @@ use crate::tenant::{
 
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::Future;
+use futures::{Future, StreamExt};
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -71,6 +71,12 @@ use super::{
 /// `<ttps://github.com/neondatabase/neon/issues/6200>`
 const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
 
+/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
+/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
+/// `PageServerConf::secondary_download_concurrency`
+const MAX_LAYER_CONCURRENCY: usize = 16;
+const MIN_LAYER_CONCURRENCY: usize = 1;
+
 pub(super) async fn downloader_task(
     tenant_manager: Arc<TenantManager>,
     remote_storage: GenericRemoteStorage,
@@ -79,14 +85,15 @@ pub(super) async fn downloader_task(
     cancel: CancellationToken,
     root_ctx: RequestContext,
 ) {
-    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    // How many tenants' secondary download operations we will run concurrently
+    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
 
     let generator = SecondaryDownloader {
         tenant_manager,
         remote_storage,
         root_ctx,
     };
-    let mut scheduler = Scheduler::new(generator, concurrency);
+    let mut scheduler = Scheduler::new(generator, tenant_concurrency);
 
     scheduler
         .run(command_queue, background_jobs_can_start, cancel)
@@ -792,6 +799,8 @@ impl<'a> TenantDownloader<'a> {
 
         tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
 
+        let mut download_futs = Vec::new();
+
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
@@ -874,67 +883,33 @@ impl<'a> TenantDownloader<'a> {
                 }
             }
 
-            // Failpoint for simulating slow remote storage
-            failpoint_support::sleep_millis_async!(
-                "secondary-layer-download-sleep",
-                &self.secondary_state.cancel
-            );
-
-            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
-            let downloaded_bytes = match download_layer_file(
-                self.conf,
-                self.remote_storage,
-                *tenant_shard_id,
-                timeline.timeline_id,
-                &layer.name,
-                &LayerFileMetadata::from(&layer.metadata),
-                &self.secondary_state.cancel,
+            download_futs.push(self.download_layer(
+                tenant_shard_id,
+                &timeline.timeline_id,
+                layer,
                 ctx,
-            )
-            .await
-            {
-                Ok(bytes) => bytes,
-                Err(DownloadError::NotFound) => {
-                    // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                    // This is harmless: continue to download the next layer. It is expected during compaction
-                    // GC.
-                    tracing::debug!(
-                        "Skipped downloading missing layer {}, raced with compaction/gc?",
-                        layer.name
-                    );
-                    continue;
+            ));
+        }
+
+        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
+        // concurrency to use based on activity level of remote storage.
+        while !download_futs.is_empty() {
+            let chunk =
+                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
+
+            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
+
+            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
+            let mut result_stream = std::pin::pin!(result_stream);
+            while let Some(result) = result_stream.next().await {
+                match result {
+                    Err(e) => return Err(e),
+                    Ok(None) => {
+                        // No error, but we didn't download the layer.  Don't mark it touched
+                    }
+                    Ok(Some(layer)) => touched.push(layer),
                 }
-                Err(e) => return Err(e.into()),
-            };
-
-            if downloaded_bytes != layer.metadata.file_size {
-                let local_path = local_layer_path(
-                    self.conf,
-                    tenant_shard_id,
-                    &timeline.timeline_id,
-                    &layer.name,
-                    &layer.metadata.generation,
-                );
-
-                tracing::warn!(
-                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
-                    layer.name,
-                    downloaded_bytes,
-                    layer.metadata.file_size
-                );
-
-                tokio::fs::remove_file(&local_path)
-                    .await
-                    .or_else(fs_ext::ignore_not_found)?;
-            } else {
-                tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
-                let mut progress = self.secondary_state.progress.lock().unwrap();
-                progress.bytes_downloaded += downloaded_bytes;
-                progress.layers_downloaded += 1;
             }
-
-            SECONDARY_MODE.download_layer.inc();
-            touched.push(layer)
         }
 
         // Write updates to state to record layers we just downloaded or touched.
@@ -966,6 +941,90 @@ impl<'a> TenantDownloader<'a> {
 
         Ok(())
     }
+
+    async fn download_layer(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        layer: HeatMapLayer,
+        ctx: &RequestContext,
+    ) -> Result<Option<HeatMapLayer>, UpdateError> {
+        // Failpoint for simulating slow remote storage
+        failpoint_support::sleep_millis_async!(
+            "secondary-layer-download-sleep",
+            &self.secondary_state.cancel
+        );
+
+        // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+        let downloaded_bytes = match download_layer_file(
+            self.conf,
+            self.remote_storage,
+            *tenant_shard_id,
+            *timeline_id,
+            &layer.name,
+            &LayerFileMetadata::from(&layer.metadata),
+            &self.secondary_state.cancel,
+            ctx,
+        )
+        .await
+        {
+            Ok(bytes) => bytes,
+            Err(DownloadError::NotFound) => {
+                // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                // This is harmless: continue to download the next layer. It is expected during compaction
+                // GC.
+                tracing::debug!(
+                    "Skipped downloading missing layer {}, raced with compaction/gc?",
+                    layer.name
+                );
+                return Ok(None);
+            }
+            Err(e) => return Err(e.into()),
+        };
+
+        if downloaded_bytes != layer.metadata.file_size {
+            let local_path = local_layer_path(
+                self.conf,
+                tenant_shard_id,
+                timeline_id,
+                &layer.name,
+                &layer.metadata.generation,
+            );
+
+            tracing::warn!(
+                "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
+                layer.name,
+                downloaded_bytes,
+                layer.metadata.file_size
+            );
+
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)?;
+        } else {
+            tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
+            let mut progress = self.secondary_state.progress.lock().unwrap();
+            progress.bytes_downloaded += downloaded_bytes;
+            progress.layers_downloaded += 1;
+        }
+
+        SECONDARY_MODE.download_layer.inc();
+
+        Ok(Some(layer))
+    }
+
+    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
+    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
+        // When less than 75% of units are available, use minimum concurrency.  Else, do a linear mapping
+        // of our concurrency range to the units available within the remaining 25%.
+        let clamp_at = (activity.read_total * 3) / 4;
+        if activity.read_available > clamp_at {
+            (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
+                / (activity.read_total - clamp_at)
+        } else {
+            MIN_LAYER_CONCURRENCY
+        }
+    }
 }
 
 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1092,3 +1151,58 @@ async fn init_timeline_state(
 
     detail
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn layer_concurrency() {
+        // Totally idle
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 16,
+                read_total: 16,
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY
+        );
+
+        // Totally busy
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 0,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Edge of the range at which we interpolate
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 12,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Midpoint of the range in which we interpolate
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 14,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY / 2
+        );
+    }
+}

From 9ffb8523597d846b6afce7eb855b820b79efcbd6 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 13 May 2024 17:14:08 -0400
Subject: [PATCH 043/130] fix(test): ensure compatibility test uses the correct
 compute node (#7741)

Use the old compute node for compat tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0c2b70202e..8432655370 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -702,8 +702,7 @@ class NeonEnvBuilder:
         config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
 
         # Update the config with new neon + postgres path in case of compat test
-        # FIXME: overriding pg_distrib_dir cause storage controller fail to start
-        # config["pg_distrib_dir"] = str(self.pg_distrib_dir)
+        config["pg_distrib_dir"] = str(self.pg_distrib_dir)
         config["neon_distrib_dir"] = str(self.neon_binpath)
 
         with (self.repo_dir / "config").open("w") as f:

From 3a6fa768286fced1787b2320d4878e1b961b5a0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 14 May 2024 01:13:25 +0200
Subject: [PATCH 044/130] Tiered compaction: cut deltas along lsn as well if
 needed (#7671)

In general, tiered compaction is splitting delta layers along the key
dimension, but this can only continue until a single key is reached: if
the changes from a single key don't fit into one layer file, we used to
create layer files of unbounded sizes.

This patch implements the method listed as TODO/FIXME in the source
code. It does the following things:

* Make `accum_key_values` take the target size and if one key's
modifications exceed it, make it fill `partition_lsns`, a vector of lsns
to use for partitioning.
* Have `retile_deltas` use that `partition_lsns` to create delta layers
separated by lsn.
* Adjust the `test_many_updates_for_single_key` to allow layer files
below 0.5 the target size. This situation can create arbitarily small
layer files: The amount of data is arbitrary that sits between having
just cut a new delta, and then stumbling upon the key that needs to be
split along lsn. This data will end up in a dedicated layer and it can
be arbitrarily small.
* Ignore single-key delta layers for depth calculation: in theory we
might have only single-key delta layers in a tier, and this might
confuse depth calculation as well, but this should be unlikely.

Fixes #7243

Part of #7554

---------

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/compaction/src/compact_tiered.rs  | 107 ++++++++++++++-----
 pageserver/compaction/src/helpers.rs         |  23 +++-
 pageserver/compaction/src/identify_levels.rs |   6 ++
 pageserver/compaction/tests/tests.rs         |   8 +-
 4 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index 20e9cf2196..a8f184af24 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -530,8 +530,6 @@ where
         // If we have accumulated only a narrow band of keyspace, create an
         // image layer. Otherwise write a delta layer.
 
-        // FIXME: deal with the case of lots of values for same key
-
         // FIXME: we are ignoring images here. Did we already divide the work
         // so that we won't encounter them here?
 
@@ -550,39 +548,94 @@ where
         let mut new_jobs = Vec::new();
 
         // Slide a window through the keyspace
-        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
+        let mut key_accum =
+            std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
         let mut all_in_window: bool = false;
         let mut window = Window::new();
+
+        // Helper function to create a job for a new delta layer with given key-lsn
+        // rectangle.
+        let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
+            // The inputs for the job are all the input layers of the original job that
+            // overlap with the rectangle.
+            let batch_layers: Vec<LayerId> = job
+                .input_layers
+                .iter()
+                .filter(|layer_id| {
+                    overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+                })
+                .cloned()
+                .collect();
+            assert!(!batch_layers.is_empty());
+            new_jobs.push(CompactionJob {
+                key_range,
+                lsn_range: lsn_range.clone(),
+                strategy: CompactionStrategy::CreateDelta,
+                input_layers: batch_layers,
+                completed: false,
+            });
+        };
+
         loop {
-            if all_in_window && window.elems.is_empty() {
+            if all_in_window && window.is_empty() {
                 // All done!
                 break;
             }
+
+            // If we now have enough keyspace for next delta layer in the window, create a
+            // new delta layer
             if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
             {
-                let batch_layers: Vec<LayerId> = job
-                    .input_layers
-                    .iter()
-                    .filter(|layer_id| {
-                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
-                    })
-                    .cloned()
-                    .collect();
-                assert!(!batch_layers.is_empty());
-                new_jobs.push(CompactionJob {
-                    key_range,
-                    lsn_range: job.lsn_range.clone(),
-                    strategy: CompactionStrategy::CreateDelta,
-                    input_layers: batch_layers,
-                    completed: false,
-                });
-            } else {
-                assert!(!all_in_window);
-                if let Some(next_key) = key_accum.next().await.transpose()? {
-                    window.feed(next_key.key, next_key.size);
-                } else {
+                create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
+                continue;
+            }
+            assert!(!all_in_window);
+
+            // Process next key in the key space
+            match key_accum.next().await.transpose()? {
+                None => {
                     all_in_window = true;
                 }
+                Some(next_key) if next_key.partition_lsns.is_empty() => {
+                    // Normal case: extend the window by the key
+                    window.feed(next_key.key, next_key.size);
+                }
+                Some(next_key) => {
+                    // A key with too large size impact for a single delta layer. This
+                    // case occurs if you make a huge number of updates for a single key.
+                    //
+                    // Drain the window with has_more = false to make a clean cut before
+                    // the key, and then make dedicated delta layers for the single key.
+                    //
+                    // We cannot cluster the key with the others, because we don't want
+                    // layer files to overlap with each other in the lsn,key space (no
+                    // overlaps for the rectangles).
+                    let key = next_key.key;
+                    debug!("key {key} with size impact larger than the layer size");
+                    while !window.is_empty() {
+                        let has_more = false;
+                        let key_range = window.choose_next_delta(self.target_file_size, has_more)
+                            .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
+                        create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
+                    }
+
+                    // Not really required: but here for future resilience:
+                    // We make a "gap" here, so any structure the window holds should
+                    // probably be reset.
+                    window = Window::new();
+
+                    let mut prior_lsn = job.lsn_range.start;
+                    let mut lsn_ranges = Vec::new();
+                    for (lsn, _size) in next_key.partition_lsns.iter() {
+                        lsn_ranges.push(prior_lsn..*lsn);
+                        prior_lsn = *lsn;
+                    }
+                    lsn_ranges.push(prior_lsn..job.lsn_range.end);
+                    for lsn_range in lsn_ranges {
+                        let key_range = key..key.next();
+                        create_delta_job(key_range, &lsn_range, &mut new_jobs);
+                    }
+                }
             }
         }
 
@@ -803,6 +856,10 @@ where
         self.elems.front().unwrap().accum_size - self.splitoff_size
     }
 
+    fn is_empty(&self) -> bool {
+        self.elems.is_empty()
+    }
+
     fn commit_upto(&mut self, mut upto: usize) {
         while upto > 1 {
             let popped = self.elems.pop_front().unwrap();
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index 06454ee1d0..2c922b0a49 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -235,9 +235,14 @@ pub struct KeySize<K> {
     pub key: K,
     pub num_values: u64,
     pub size: u64,
+    /// The lsns to partition at (if empty then no per-lsn partitioning)
+    pub partition_lsns: Vec<(Lsn, u64)>,
 }
 
-pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
+pub fn accum_key_values<'a, I, K, D, E>(
+    input: I,
+    target_size: u64,
+) -> impl Stream<Item = Result<KeySize<K>, E>>
 where
     K: Eq + PartialOrd + Display + Copy,
     I: Stream<Item = Result<D, E>>,
@@ -249,25 +254,35 @@ where
 
         if let Some(first) = input.next().await {
             let first = first?;
+            let mut part_size = first.size();
             let mut accum: KeySize<K> = KeySize {
                 key: first.key(),
                 num_values: 1,
-                size: first.size(),
+                size: part_size,
+                partition_lsns: Vec::new(),
             };
             let mut last_key = accum.key;
             while let Some(this) = input.next().await {
                 let this = this?;
                 if this.key() == accum.key {
-                    accum.size += this.size();
+                    let add_size = this.size();
+                    if part_size + add_size > target_size {
+                        accum.partition_lsns.push((this.lsn(), part_size));
+                        part_size = 0;
+                    }
+                    part_size += add_size;
+                    accum.size += add_size;
                     accum.num_values += 1;
                 } else {
                     assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
                     last_key = accum.key;
                     yield accum;
+                    part_size = this.size();
                     accum = KeySize {
                         key: this.key(),
                         num_values: 1,
-                        size: this.size(),
+                        size: part_size,
+                        partition_lsns: Vec::new(),
                     };
                 }
             }
diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs
index 98dd46925c..1853afffdd 100644
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -184,6 +184,12 @@ impl<L> Level<L> {
         }
         let mut events: Vec<Event<K>> = Vec::new();
         for (idx, l) in self.layers.iter().enumerate() {
+            let key_range = l.key_range();
+            if key_range.end == key_range.start.next() && l.is_delta() {
+                // Ignore single-key delta layers as they can be stacked on top of each other
+                // as that is the only way to cut further.
+                continue;
+            }
             events.push(Event {
                 key: l.key_range().start,
                 layer_idx: idx,
diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs
index 7aa20e6863..bd8b54a286 100644
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -20,10 +20,6 @@ pub(crate) fn setup_logging() {
 /// even if we produce an extremely narrow delta layer, spanning just that one
 /// key, we still too many records to fit in the target file size. We need to
 /// split in the LSN dimension too in that case.
-///
-/// TODO: The code to avoid this problem has not been implemented yet! So the
-/// assertion currently fails, but we need to make it not fail.
-#[ignore]
 #[tokio::test]
 async fn test_many_updates_for_single_key() {
     setup_logging();
@@ -43,9 +39,9 @@ async fn test_many_updates_for_single_key() {
     }
     for l in executor.live_layers.iter() {
         assert!(l.file_size() < executor.target_file_size * 2);
-        // sanity check that none of the delta layers are stupidly small either
+        // Sanity check that none of the delta layers are empty either.
         if l.is_delta() {
-            assert!(l.file_size() > executor.target_file_size / 2);
+            assert!(l.file_size() > 0);
         }
     }
 }

From ba20752b7678179f5db0248ce98f56d225684c77 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 13 May 2024 13:17:27 +0300
Subject: [PATCH 045/130] Refactor the request LSNs to a separate struct
 (#7708)

We had a lot of code that passed around the two LSNs that are
associated with each GetPage request. Introduce a new struct to
encapsulate them. I'm about to add a third LSN to the struct in the
next commit, this is a mechanical refactoring in preparation for that.
---
 pgxn/neon/pagestore_client.h    |  24 ++++-
 pgxn/neon/pagestore_smgr.c      | 179 ++++++++++++++------------------
 pgxn/neon_test_utils/neontest.c |  24 ++---
 3 files changed, 111 insertions(+), 116 deletions(-)

diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 7709ab9d42..1334a04f9a 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -237,18 +237,38 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
 
+/*
+ * LSN values associated with each request to the pageserver
+ */
+typedef struct
+{
+	/*
+	 * 'request_lsn' is the main value that determines which page version to
+	 * fetch.
+	 */
+	XLogRecPtr request_lsn;
+
+	/*
+	 * A hint to the pageserver that the requested page hasn't been modified
+	 * between this LSN and 'request_lsn'. That allows the pageserver to
+	 * return the page faster, without waiting for 'request_lsn' to arrive in
+	 * the pageserver, as long as 'not_modified_since' has arrived.
+	 */
+	XLogRecPtr not_modified_since;
+} neon_request_lsns;
+
 #if PG_MAJORVERSION_NUM < 16
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
+										 neon_request_lsns request_lsns, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
+										 neon_request_lsns request_lsns, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 44ecdbd9aa..8fcbbfe54d 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -168,8 +168,7 @@ typedef enum PrefetchStatus
 typedef struct PrefetchRequest
 {
 	BufferTag	buftag;			/* must be first entry in the struct */
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
 	shardno_t   shard_no;
@@ -271,16 +270,15 @@ static PrefetchState *MyPState;
 
 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
-static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
+static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
 static bool prefetch_read(PrefetchRequest *slot);
-static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
+static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
 static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);
 
-static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-								 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
-static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
+static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
+static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
 										  PrefetchRequest *slot);
 
 static bool
@@ -338,8 +336,7 @@ compact_prefetch_buffers(void)
 		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
-		target_slot->request_lsn = source_slot->request_lsn;
-		target_slot->not_modified_since = source_slot->not_modified_since;
+		target_slot->request_lsns = source_slot->request_lsns;
 		target_slot->my_ring_index = empty_ring_index;
 
 		prfh_delete(MyPState->prf_hash, source_slot);
@@ -358,8 +355,9 @@ compact_prefetch_buffers(void)
 		};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
-		source_slot->request_lsn = InvalidXLogRecPtr;
-		source_slot->not_modified_since = InvalidXLogRecPtr;
+		source_slot->request_lsns = (neon_request_lsns) {
+			InvalidXLogRecPtr, InvalidXLogRecPtr
+		};
 
 		/* update bookkeeping */
 		n_moved++;
@@ -689,7 +687,7 @@ prefetch_set_unused(uint64 ring_index)
  * prefetch_wait_for().
  */
 static void
-prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
+prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
 	NeonGetPageRequest request = {
@@ -700,23 +698,14 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
 		.blkno = slot->buftag.blockNum,
 	};
 
-	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
-
-	if (force_request_lsn)
-	{
-		request.req.lsn = *force_request_lsn;
-		request.req.not_modified_since = *force_not_modified_since;
-	}
+	if (force_request_lsns)
+		slot->request_lsns = *force_request_lsns;
 	else
-	{
-		neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
-							 slot->buftag.forkNum,
-							 slot->buftag.blockNum,
-							 &request.req.lsn,
-							 &request.req.not_modified_since);
-	}
-	slot->request_lsn = request.req.lsn;
-	slot->not_modified_since = request.req.not_modified_since;
+		slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+												   slot->buftag.forkNum,
+												   slot->buftag.blockNum);
+	request.req.lsn = slot->request_lsns.request_lsn;
+	request.req.not_modified_since = slot->request_lsns.not_modified_since;
 
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -742,25 +731,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
  *
  * Register that we may want the contents of BufferTag in the near future.
  *
- * If force_request_lsn and force_not_modified_since are not NULL, those
- * values are sent to the pageserver. If they are NULL, we utilize the
- * lastWrittenLsn -infrastructure to fill them in.
+ * If force_request_lsns is not NULL, those values are sent to the
+ * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
+ * to calculate the LSNs to send.
  *
  * NOTE: this function may indirectly update MyPState->pfs_hash; which
  * invalidates any active pointers into the hash table.
  */
 
 static uint64
-prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
-						 XLogRecPtr *force_not_modified_since)
+prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
 {
 	uint64		ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;
 
-	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
-
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
 Retry:
@@ -781,10 +767,9 @@ Retry:
 		 * If the caller specified a request LSN to use, only accept prefetch
 		 * responses that satisfy that request.
 		 */
-		if (force_request_lsn)
+		if (force_request_lsns)
 		{
-			if (!neon_prefetch_response_usable(*force_request_lsn,
-											   *force_not_modified_since, slot))
+			if (!neon_prefetch_response_usable(*force_request_lsns, slot))
 			{
 				/* Wait for the old request to finish and discard it */
 				if (!prefetch_wait_for(ring_index))
@@ -886,7 +871,7 @@ Retry:
 	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;
 
-	prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
+	prefetch_do_request(slot, force_request_lsns);
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(MyPState->ring_last <= ring_index &&
 		   ring_index < MyPState->ring_unused);
@@ -1529,11 +1514,11 @@ nm_adjust_lsn(XLogRecPtr lsn)
 /*
  * Return LSN for requesting pages and number of blocks from page server
  */
-static void
-neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-					 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
+static neon_request_lsns
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 {
 	XLogRecPtr	last_written_lsn;
+	neon_request_lsns result;
 
 	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 	last_written_lsn = nm_adjust_lsn(last_written_lsn);
@@ -1542,12 +1527,12 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	if (RecoveryInProgress())
 	{
 		/* Request the page at the last replayed LSN. */
-		*request_lsn = GetXLogReplayRecPtr(NULL);
-		*not_modified_since = last_written_lsn;
-		Assert(last_written_lsn <= *request_lsn);
+		result.request_lsn = GetXLogReplayRecPtr(NULL);
+		result.not_modified_since = last_written_lsn;
+		Assert(last_written_lsn <= result.request_lsn);
 
-		neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
-				 LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
+		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
+				 LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
 	}
 	else
 	{
@@ -1559,7 +1544,7 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * must still in the buffer cache, so our request cannot concern
 		 * those.
 		 */
-		neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
+		neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
 				 LSN_FORMAT_ARGS(last_written_lsn));
 
 		/*
@@ -1592,9 +1577,11 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * flush, it should still be in the buffer cache, and we wouldn't be
 		 * requesting it.
 		 */
-		*request_lsn = flushlsn;
-		*not_modified_since = last_written_lsn;
+		result.request_lsn = flushlsn;
+		result.not_modified_since = last_written_lsn;
 	}
+
+	return result;
 }
 
 /*
@@ -1604,12 +1591,12 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
  * satisfy a page read now.
  */
 static bool
-neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
+neon_prefetch_response_usable(neon_request_lsns request_lsns,
 							  PrefetchRequest *slot)
 {
 	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsn >= not_modified_since);
-	Assert(slot->request_lsn >= slot->not_modified_since);
+	Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
+	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
 	Assert(slot->status != PRFS_UNUSED);
 
 	/*
@@ -1627,14 +1614,15 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si
 	 * calculate LSNs "out of order" with each other, but the prefetch queue
 	 * is backend-private at the moment.)
 	 */
-	if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
+	if (request_lsns.request_lsn < slot->request_lsns.request_lsn ||
+		request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_IO_ERROR),
 				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
 				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
+						   LSN_FORMAT_ARGS(request_lsns.request_lsn), LSN_FORMAT_ARGS(request_lsns.not_modified_since),
+						   LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
 		return false;
 	}
 
@@ -1675,9 +1663,9 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si
 	 */
 
 	/* this follows from the checks above */
-	Assert(request_lsn >= slot->not_modified_since);
+	Assert(request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
 
-	return not_modified_since <= slot->request_lsn;
+	return request_lsns.not_modified_since <= slot->request_lsns.request_lsn;
 }
 
 /*
@@ -1689,8 +1677,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	bool		exists;
 	NeonResponse *resp;
 	BlockNumber n_blocks;
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -1745,15 +1732,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}
 
-	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
-			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
+			.req.lsn = request_lsns.request_lsn,
+			.req.not_modified_since = request_lsns.not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
-		.forknum = forkNum};
+			.forknum = forkNum
+		};
 
 		resp = page_server_request(&request);
 	}
@@ -1770,7 +1757,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 					 errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forkNum,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							LSN_FORMAT_ARGS(request_lsns.request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2135,7 +2122,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
-	ring_index = prefetch_register_buffer(tag, NULL, NULL);
+	ring_index = prefetch_register_buffer(tag, NULL);
 
 	Assert(ring_index < MyPState->ring_unused &&
 		   MyPState->ring_last <= ring_index);
@@ -2188,10 +2175,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 void
 #if PG_MAJORVERSION_NUM < 16
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
+				 neon_request_lsns request_lsns, char *buffer)
 #else
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
+				 neon_request_lsns request_lsns, void *buffer)
 #endif
 {
 	NeonResponse *resp;
@@ -2223,7 +2210,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * value of the LwLsn cache when the entry is not found.
 	 */
 	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
-		XLogWaitForReplayOf(request_lsn);
+		XLogWaitForReplayOf(request_lsns.request_lsn);
 
 	/*
 	 * Try to find prefetched page in the list of received pages.
@@ -2234,7 +2221,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	if (entry != NULL)
 	{
 		slot = entry->slot;
-		if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
+		if (neon_prefetch_response_usable(request_lsns, slot))
 		{
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
@@ -2268,8 +2255,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		{
 			pgBufferUsage.prefetch.misses += 1;
 
-			ring_index = prefetch_register_buffer(buftag, &request_lsn,
-												  &not_modified_since);
+			ring_index = prefetch_register_buffer(buftag, &request_lsns);
 			slot = GetPrfSlot(ring_index);
 		}
 		else
@@ -2310,7 +2296,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							LSN_FORMAT_ARGS(request_lsns.request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2333,8 +2319,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -2359,9 +2344,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		return;
 	}
 
-	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
-						 &request_lsn, &not_modified_since);
-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2530,8 +2514,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	NeonResponse *resp;
 	BlockNumber n_blocks;
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -2558,13 +2541,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}
 
-	neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
-			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
+			.req.lsn = request_lsns.request_lsn,
+			.req.not_modified_since = request_lsns.not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
 			.forknum = forknum,
 		};
@@ -2584,7 +2566,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 					 errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							LSN_FORMAT_ARGS(request_lsns.request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2595,10 +2577,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
 
 	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-		 forknum,
-		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-		 n_blocks);
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+			 forknum,
+			 LSN_FORMAT_ARGS(request_lsns.request_lsn),
+			 n_blocks);
 
 	pfree(resp);
 	return n_blocks;
@@ -2612,17 +2594,15 @@ neon_dbsize(Oid dbNode)
 {
 	NeonResponse *resp;
 	int64		db_size;
-	XLogRecPtr	request_lsn,
-				not_modified_since;
+	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
 
-	neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonDbSizeRequest request = {
 			.req.tag = T_NeonDbSizeRequest,
-			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
+			.req.lsn = request_lsns.request_lsn,
+			.req.not_modified_since = request_lsns.not_modified_since,
 			.dbNode = dbNode,
 		};
 
@@ -2639,8 +2619,7 @@ neon_dbsize(Oid dbNode)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
-							dbNode,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2650,9 +2629,7 @@ neon_dbsize(Oid dbNode)
 	}
 
 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-		 dbNode,
-		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-		 db_size);
+			 dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn), db_size);
 
 	pfree(resp);
 	return db_size;
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index 677006923d..9f63b58a86 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
  */
 #if PG_MAJORVERSION_NUM < 16
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
+									   neon_request_lsns request_lsns, char *buffer);
 #else
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
+									   neon_request_lsns request_lsns, void *buffer);
 #endif
 
 static neon_read_at_lsn_type neon_read_at_lsn_ptr;
@@ -298,9 +298,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	text	   *relname;
 	text	   *forkname;
 	uint32		blkno;
-
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns	request_lsns;
 
 	if (PG_NARGS() != 5)
 		elog(ERROR, "unexpected number of arguments in SQL function signature");
@@ -312,8 +310,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	forkname = PG_GETARG_TEXT_PP(1);
 	blkno = PG_GETARG_UINT32(2);
 
-	request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
-	not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4);
+	request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
+	request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4);
 
 	if (!superuser())
 		ereport(ERROR,
@@ -367,7 +365,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 	raw_page_data = VARDATA(raw_page);
 
-	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data);
+	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns,
+					 raw_page_data);
 
 	relation_close(rel, AccessShareLock);
 
@@ -413,19 +412,18 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 
 		ForkNumber	forknum = PG_GETARG_UINT32(3);
 		uint32		blkno = PG_GETARG_UINT32(4);
-		XLogRecPtr	request_lsn;
-		XLogRecPtr	not_modified_since;
+		neon_request_lsns	request_lsns;
 
 		/* Initialize buffer to copy to */
 		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
 
-		request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
-		not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6);
+		request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
+		request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6);
 
 		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 		raw_page_data = VARDATA(raw_page);
 
-		neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data);
+		neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data);
 		PG_RETURN_BYTEA_P(raw_page);
 	}
 }

From 22afaea6e1c595282633b210ac02990f98378458 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 13 May 2024 13:17:30 +0300
Subject: [PATCH 046/130] Always use Lsn::MAX as the request LSN in the primary
 (#7708)

The new protocol version supports sending two LSNs to the pageserver:
request LSN and a "not_modified_since" hint. A primary always wants to
read the latest version of each page, so having two values was not
strictly necessary, and the old protocol worked fine with just the
"not_modified_since" LSN and a flag to request the latest page
version. Nevertheless, it seemed like a good idea to set the request
LSN to the current insert/flush LSN, because that's logically the page
version that the primary wants to read.

However, that made the test_gc_aggressive test case flaky. When the
primary requests a page with the last inserted or flushed LSN, it's
possible that by the time that the pageserver processes the request,
more WAL has been generated by other processes in the compute and
already digested by the pageserver. Furthermore, if the PITR horizon
in the pageserver is set to 0, and GC runs during that window, it's
possible that the GC horizon has advances past the request LSN, before
the pageserver processes the request. It is still correct to send the
latest page version in that case, because the compute either has the
page locked so the it cannot have been modified in the primary, or if
it's a prefetch request, and we will validate the LSNs when the
prefetch response is processed and discard it if the page has been
modified. But the pageserver doesn't know that and rightly complains.

To fix, modify the compute so that the primary always uses Lsn::MAX in
the requests. This reverts the primary's behavior to how the protocol
version 1 worked. In protocol version 1, there was only one LSN, the
"not_modified_since" hint, and a flag was set to read the latest page
version, whatever that might be. Requests from computes that are still
using protocol version 1 were already mapped to Lsn::MAX in the
pageserver, now we do the same with protocol version 2 for primary's
requests. (I'm a bit sad about losing the information in the
pageserver, what the last LSN was at the time that the request wa
made. We never had it with protocol version 1, but I wanted to make it
available for debugging purposes.)

Add another field, 'effective_request_lsn', to track what the flush
LSN was when the request was made. It's not sent to the pageserver,
Lsn::MAX is now used as the request LSN, but it's still needed
internally in the compute to track the validity of prefetch requests.

Fixes issue https://github.com/neondatabase/neon/issues/7692
---
 pgxn/neon/pagestore_client.h    | 12 +++++
 pgxn/neon/pagestore_smgr.c      | 95 +++++++++++++++++++++++----------
 pgxn/neon_test_utils/neontest.c | 14 +++++
 3 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 1334a04f9a..8951e6607b 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -255,6 +255,18 @@ typedef struct
 	 * the pageserver, as long as 'not_modified_since' has arrived.
 	 */
 	XLogRecPtr not_modified_since;
+
+	/*
+	 * 'effective_request_lsn' is not included in the request that's sent to
+	 * the pageserver, but is used to keep track of the latest LSN of when the
+	 * request was made. In a standby server, this is always the same as the
+	 * 'request_lsn', but in the primary we use UINT64_MAX as the
+	 * 'request_lsn' to request the latest page version, so we need this
+	 * separate field to remember that latest LSN was when the request was
+	 * made. It's needed to manage prefetch request, to verify if the response
+	 * to a prefetched request is still valid.
+	 */
+	XLogRecPtr effective_request_lsn;
 } neon_request_lsns;
 
 #if PG_MAJORVERSION_NUM < 16
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 8fcbbfe54d..e3b841f526 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -356,7 +356,7 @@ compact_prefetch_buffers(void)
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
 		source_slot->request_lsns = (neon_request_lsns) {
-			InvalidXLogRecPtr, InvalidXLogRecPtr
+			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
 		};
 
 		/* update bookkeeping */
@@ -1529,6 +1529,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		/* Request the page at the last replayed LSN. */
 		result.request_lsn = GetXLogReplayRecPtr(NULL);
 		result.not_modified_since = last_written_lsn;
+		result.effective_request_lsn = result.request_lsn;
 		Assert(last_written_lsn <= result.request_lsn);
 
 		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
@@ -1570,15 +1571,30 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		}
 
 		/*
-		 * Request the latest version of the page. The most up-to-date request
-		 * LSN we could use would be the current insert LSN, but to avoid the
-		 * overhead of looking it up, use 'flushlsn' instead. This relies on
-		 * the assumption that if the page was modified since the last WAL
-		 * flush, it should still be in the buffer cache, and we wouldn't be
-		 * requesting it.
+		 * Request the very latest version of the page. In principle we
+		 * want to read the page at the current insert LSN, and we could
+		 * use that value in the request. However, there's a corner case
+		 * with pageserver's garbage collection. If the GC horizon is
+		 * set to a very small value, it's possible that by the time
+		 * that the pageserver processes our request, the GC horizon has
+		 * already moved past the LSN we calculate here. Standby servers
+		 * always have that problem as the can always lag behind the
+		 * primary, but for the primary we can avoid it by always
+		 * requesting the latest page, by setting request LSN to
+		 * UINT64_MAX.
+		 *
+		 * Remember the current LSN, however, so that we can later
+		 * correctly determine if the response to the request is still
+		 * valid. The most up-to-date LSN we could use for that purpose
+		 * would be the current insert LSN, but to avoid the overhead of
+		 * looking it up, use 'flushlsn' instead. This relies on the
+		 * assumption that if the page was modified since the last WAL
+		 * flush, it should still be in the buffer cache, and we
+		 * wouldn't be requesting it.
 		 */
-		result.request_lsn = flushlsn;
+		result.request_lsn = UINT64_MAX;
 		result.not_modified_since = last_written_lsn;
+		result.effective_request_lsn = flushlsn;
 	}
 
 	return result;
@@ -1596,7 +1612,11 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 {
 	/* sanity check the LSN's on the old and the new request */
 	Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
+	Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
+	Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
 	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
 	Assert(slot->status != PRFS_UNUSED);
 
 	/*
@@ -1614,27 +1634,40 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 * calculate LSNs "out of order" with each other, but the prefetch queue
 	 * is backend-private at the moment.)
 	 */
-	if (request_lsns.request_lsn < slot->request_lsns.request_lsn ||
+	if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
 		request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_IO_ERROR),
 				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
 				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsns.request_lsn), LSN_FORMAT_ARGS(request_lsns.not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
+						   LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
+						   LSN_FORMAT_ARGS(request_lsns.not_modified_since),
+						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
+						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
 		return false;
 	}
 
 	/*---
-	 * Each request to the pageserver carries two LSN values:
-	 * `not_modified_since` and `request_lsn`. The (not_modified_since,
-	 * request_lsn] range of each request is effectively a claim that the page
-	 * has not been modified between those LSNs.  If the range of the old
-	 * request in the queue overlaps with the new request, we know that the
-	 * page hasn't been modified in the union of the ranges. We can use the
-	 * response to old request to satisfy the new request in that case. For
-	 * example:
+	 * Each request to the pageserver has three LSN values associated with it:
+	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
+	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
+	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
+	 * we remember `effective_request_lsn` separately. In a primary,
+	 * `effective_request_lsn` is the last flush WAL position when the request
+	 * was sent to the pageserver. That's logically the LSN that we are
+	 * requesting the page at, but we send UINT64_MAX to the pageserver so
+	 * that if the GC horizon advances past that position, we still get a
+	 * valid response instead of an error.
+	 *
+	 * To determine whether a response to a GetPage request issued earlier is
+	 * still valid to satisfy a new page read, we look at the
+	 * (not_modified_since, effective_request_lsn] range of the request. It is
+	 * effectively a claim that the page has not been modified between those
+	 * LSNs.  If the range of the old request in the queue overlaps with the
+	 * new request, we know that the page hasn't been modified in the union of
+	 * the ranges. We can use the response to old request to satisfy the new
+	 * request in that case. For example:
 	 *
 	 *              100      500
 	 * Old request:  +--------+
@@ -1663,9 +1696,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 */
 
 	/* this follows from the checks above */
-	Assert(request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
 
-	return request_lsns.not_modified_since <= slot->request_lsns.request_lsn;
+	return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
 }
 
 /*
@@ -1757,7 +1790,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 					 errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forkNum,
-							LSN_FORMAT_ARGS(request_lsns.request_lsn)),
+							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2296,7 +2329,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
-							LSN_FORMAT_ARGS(request_lsns.request_lsn)),
+							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2566,7 +2599,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 					 errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum,
-							LSN_FORMAT_ARGS(request_lsns.request_lsn)),
+							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2579,7 +2612,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
 			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 			 forknum,
-			 LSN_FORMAT_ARGS(request_lsns.request_lsn),
+			 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
 			 n_blocks);
 
 	pfree(resp);
@@ -2619,7 +2652,7 @@ neon_dbsize(Oid dbNode)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
-							dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn)),
+							dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2629,7 +2662,7 @@ neon_dbsize(Oid dbNode)
 	}
 
 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-			 dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn), db_size);
+			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
 
 	pfree(resp);
 	return db_size;
@@ -2874,6 +2907,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	XLogRecPtr request_lsn,
 		not_modified_since;
 
+	/*
+	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
+	 * logic is a bit simpler.
+	 */
 	if (RecoveryInProgress())
 	{
 		request_lsn = GetXLogReplayRecPtr(NULL);
@@ -2885,10 +2922,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			 */
 			request_lsn = GetRedoStartLsn();
 		}
+		request_lsn = nm_adjust_lsn(request_lsn);
 	}
 	else
-		request_lsn = GetXLogInsertRecPtr();
-	request_lsn = nm_adjust_lsn(request_lsn);
+		request_lsn = UINT64_MAX;
 
 	/*
 	 * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index 9f63b58a86..47f245fbf1 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -312,6 +312,13 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 
 	request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
 	request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4);
+	/*
+	 * For the time being, use the same LSN for request and
+	 * effective request LSN. If any test needed to use UINT64_MAX
+	 * as the request LSN, we'd need to add effective_request_lsn
+	 * as a new argument.
+	 */
+	request_lsns.effective_request_lsn = request_lsns.request_lsn;
 
 	if (!superuser())
 		ereport(ERROR,
@@ -419,6 +426,13 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 
 		request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
 		request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6);
+		/*
+		 * For the time being, use the same LSN for request
+		 * and effective request LSN. If any test needed to
+		 * use UINT64_MAX as the request LSN, we'd need to add
+		 * effective_request_lsn as a new argument.
+		 */
+		request_lsns.effective_request_lsn = request_lsns.request_lsn;
 
 		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 		raw_page_data = VARDATA(raw_page);

From cd0e34493887c91485184f3fa5515ea94438aeda Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 14 May 2024 09:31:26 +0100
Subject: [PATCH 047/130] pageserver: do fewer heatmap uploads for tiny tenants
 (#7731)

## Problem

Currently we do a large number of heatmap uploads for tiny tenants.
"tiny" in this context is defined as being less than a single layer in
size. These uploads are triggered by atime changes rather than changes
in the set of layers.

Uploading heatmaps for atime changes on small tenants isn't useful,
because even without bumping these atimes, disk usage eviction still
avoids evicting the largest resident layer of a tenant, which in
practice keeps tiny/empty tenants mostly resident irrespective of
atimes.

## Summary of changes

- For tenants smaller than one checkpoint interval, only upload heatmap
if the set of layers has changed, not if only the atimes have changed.
- Include the heatmap period in the uploaded heatmap, as a precursor to
implementing https://github.com/neondatabase/neon/issues/6200
(auto-adjusting download intervals to match upload intervals)
---
 pageserver/src/tenant/secondary/heatmap.rs    | 25 ++++++
 .../src/tenant/secondary/heatmap_uploader.rs  | 82 +++++++++++++------
 2 files changed, 81 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index ca91ec24c6..2da4a3b9d5 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -15,6 +15,14 @@ pub(super) struct HeatMapTenant {
     pub(super) generation: Generation,
 
     pub(super) timelines: Vec<HeatMapTimeline>,
+
+    /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
+    /// of how frequently it is worthwhile to check for updates.
+    ///
+    /// This is optional for backward compat, and because we sometimes might upload
+    /// a heatmap explicitly via API for a tenant that has no periodic upload configured.
+    #[serde(default)]
+    pub(super) upload_period_ms: Option<u128>,
 }
 
 #[serde_as]
@@ -81,4 +89,21 @@ impl HeatMapTenant {
 
         stats
     }
+
+    pub(crate) fn strip_atimes(self) -> Self {
+        Self {
+            timelines: self
+                .timelines
+                .into_iter()
+                .map(|mut tl| {
+                    for layer in &mut tl.layers {
+                        layer.access_time = SystemTime::UNIX_EPOCH;
+                    }
+                    tl
+                })
+                .collect(),
+            generation: self.generation,
+            upload_period_ms: self.upload_period_ms,
+        }
+    }
 }
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 352409f5fc..fddced3ead 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress {
 
 struct UploadPending {
     tenant: Arc<Tenant>,
-    last_digest: Option<md5::Digest>,
+    last_upload: Option<LastUploadState>,
     target_time: Option<Instant>,
     period: Option<Duration>,
 }
@@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending {
 struct WriteComplete {
     tenant_shard_id: TenantShardId,
     completed_at: Instant,
-    digest: Option<md5::Digest>,
+    uploaded: Option<LastUploadState>,
     next_upload: Option<Instant>,
 }
 
@@ -115,10 +115,7 @@ struct UploaderTenantState {
     tenant: Weak<Tenant>,
 
     /// Digest of the serialized heatmap that we last successfully uploaded
-    ///
-    /// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
-    /// which is also an md5sum.
-    last_digest: Option<md5::Digest>,
+    last_upload_state: Option<LastUploadState>,
 
     /// When the last upload attempt completed (may have been successful or failed)
     last_upload: Option<Instant>,
@@ -187,7 +184,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                     tenant: Arc::downgrade(&tenant),
                     last_upload: None,
                     next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
-                    last_digest: None,
+                    last_upload_state: None,
                 });
 
             // Decline to do the upload if insufficient time has passed
@@ -195,10 +192,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                 return;
             }
 
-            let last_digest = state.last_digest;
+            let last_upload = state.last_upload_state.clone();
             result.jobs.push(UploadPending {
                 tenant,
-                last_digest,
+                last_upload,
                 target_time: state.next_upload,
                 period: Some(period),
             });
@@ -218,7 +215,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
     ) {
         let UploadPending {
             tenant,
-            last_digest,
+            last_upload,
             target_time,
             period,
         } = job;
@@ -231,16 +228,16 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             let _completion = completion;
 
             let started_at = Instant::now();
-            let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
-                Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
+            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
+                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
                     let duration = Instant::now().duration_since(started_at);
                     SECONDARY_MODE
                         .upload_heatmap_duration
                         .observe(duration.as_secs_f64());
                     SECONDARY_MODE.upload_heatmap.inc();
-                    Some(digest)
+                    Some(uploaded)
                 }
-                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
+                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
                 Err(UploadHeatmapError::Upload(e)) => {
                     tracing::warn!(
                         "Failed to upload heatmap for tenant {}: {e:#}",
@@ -251,11 +248,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                         .upload_heatmap_duration
                         .observe(duration.as_secs_f64());
                     SECONDARY_MODE.upload_heatmap_errors.inc();
-                    last_digest
+                    last_upload
                 }
                 Err(UploadHeatmapError::Cancelled) => {
                     tracing::info!("Cancelled heatmap upload, shutting down");
-                    last_digest
+                    last_upload
                 }
             };
 
@@ -277,7 +274,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             WriteComplete {
                     tenant_shard_id: *tenant.get_tenant_shard_id(),
                     completed_at: now,
-                    digest,
+                    uploaded,
                     next_upload,
                 }
         }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
@@ -299,7 +296,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
 
         Ok(UploadPending {
             // Ignore our state for last digest: this forces an upload even if nothing has changed
-            last_digest: None,
+            last_upload: None,
             tenant,
             target_time: None,
             period: None,
@@ -312,7 +309,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
         let WriteComplete {
             tenant_shard_id,
             completed_at,
-            digest,
+            uploaded,
             next_upload,
         } = completion;
         use std::collections::hash_map::Entry;
@@ -322,7 +319,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             }
             Entry::Occupied(mut entry) => {
                 entry.get_mut().last_upload = Some(completed_at);
-                entry.get_mut().last_digest = digest;
+                entry.get_mut().last_upload_state = uploaded;
                 entry.get_mut().next_upload = next_upload
             }
         }
@@ -331,7 +328,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
 
 enum UploadHeatmapOutcome {
     /// We successfully wrote to remote storage, with this digest.
-    Uploaded(md5::Digest),
+    Uploaded(LastUploadState),
     /// We did not upload because the heatmap digest was unchanged since the last upload
     NoChange,
     /// We skipped the upload for some reason, such as tenant/timeline not ready
@@ -347,12 +344,25 @@ enum UploadHeatmapError {
     Upload(#[from] anyhow::Error),
 }
 
+/// Digests describing the heatmap we most recently uploaded successfully.
+///
+/// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
+/// which is also an md5sum.
+#[derive(Clone)]
+struct LastUploadState {
+    // Digest of json-encoded HeatMapTenant
+    uploaded_digest: md5::Digest,
+
+    // Digest without atimes set.
+    layers_only_digest: md5::Digest,
+}
+
 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
 async fn upload_tenant_heatmap(
     remote_storage: GenericRemoteStorage,
     tenant: &Arc<Tenant>,
-    last_digest: Option<md5::Digest>,
+    last_upload: Option<LastUploadState>,
 ) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
     debug_assert_current_span_has_tenant_id();
 
@@ -368,6 +378,7 @@ async fn upload_tenant_heatmap(
     let mut heatmap = HeatMapTenant {
         timelines: Vec::new(),
         generation,
+        upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()),
     };
     let timelines = tenant.timelines.lock().unwrap().clone();
 
@@ -396,15 +407,31 @@ async fn upload_tenant_heatmap(
 
     // Serialize the heatmap
     let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
-    let bytes = bytes::Bytes::from(bytes);
-    let size = bytes.len();
 
     // Drop out early if nothing changed since our last upload
     let digest = md5::compute(&bytes);
-    if Some(digest) == last_digest {
+    if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) {
         return Ok(UploadHeatmapOutcome::NoChange);
     }
 
+    // Calculate a digest that omits atimes, so that we can distinguish actual changes in
+    // layers from changes only in atimes.
+    let heatmap_size_bytes = heatmap.get_stats().bytes;
+    let layers_only_bytes =
+        serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?;
+    let layers_only_digest = md5::compute(&layers_only_bytes);
+    if heatmap_size_bytes < tenant.get_checkpoint_distance() {
+        // For small tenants, skip upload if only atimes changed. This avoids doing frequent
+        // uploads from long-idle tenants whose atimes are just incremented by periodic
+        // size calculations.
+        if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) {
+            return Ok(UploadHeatmapOutcome::NoChange);
+        }
+    }
+
+    let bytes = bytes::Bytes::from(bytes);
+    let size = bytes.len();
+
     let path = remote_heatmap_path(tenant.get_tenant_shard_id());
 
     let cancel = &tenant.cancel;
@@ -436,5 +463,8 @@ async fn upload_tenant_heatmap(
 
     tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
 
-    Ok(UploadHeatmapOutcome::Uploaded(digest))
+    Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
+        uploaded_digest: digest,
+        layers_only_digest,
+    }))
 }

From df0f1e359b1ff01474ffaac26cec730697d1d643 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 14 May 2024 09:37:48 +0100
Subject: [PATCH 048/130] pageserver: switch on new-style local layer paths
 (#7660)

We recently added support for local layer paths that contain a
generation number:
- https://github.com/neondatabase/neon/pull/7609
- https://github.com/neondatabase/neon/pull/7640

Now that we've cut a
[release](https://github.com/neondatabase/neon/pull/7735) that includes
those changes, we can proceed to enable writing the new format without
breaking forward compatibility.
---
 pageserver/src/tenant/storage_layer/layer.rs    | 17 +++++++----------
 .../regress/test_pageserver_generations.py      |  1 -
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b5b0260327..b6f7702247 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -129,19 +129,16 @@ pub(crate) fn local_layer_path(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
     layer_file_name: &LayerName,
-    _generation: &Generation,
+    generation: &Generation,
 ) -> Utf8PathBuf {
     let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
 
-    timeline_path.join(layer_file_name.to_string())
-
-    // TODO: switch to enabling new-style layer paths after next release
-    // if generation.is_none() {
-    //     // Without a generation, we may only use legacy path style
-    //     timeline_path.join(layer_file_name.to_string())
-    // } else {
-    //     timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
-    // }
+    if generation.is_none() {
+        // Without a generation, we may only use legacy path style
+        timeline_path.join(layer_file_name.to_string())
+    } else {
+        timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
+    }
 }
 
 impl Layer {
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index a38bcd45da..4fdc5852f5 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -703,7 +703,6 @@ def test_multi_attach(
     workload.validate(pageservers[2].id)
 
 
-@pytest.mark.skip(reason="To be enabled after release with new local path style")
 def test_upgrade_generationless_local_file_paths(
     neon_env_builder: NeonEnvBuilder,
 ):

From b6ee91835b5eaf1d210805d0e4e07d01214445dd Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 14 May 2024 11:39:59 +0100
Subject: [PATCH 049/130]  CI(report-benchmarks-failures): fix condition
 (#7745)

## Problem

`report-benchmarks-failures` job is triggered for any failure in the CI
pipeline, but we need it to be triggered only for failed `benchmarks`
job

## Summary of changes
- replace `failure()` with `needs.benchmarks.result == 'failure'` in the
condition
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index f417cecd58..14d19f7ae3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -548,7 +548,7 @@ jobs:
 
   report-benchmarks-failures:
     needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && failure()
+    if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
     runs-on: ubuntu-latest
 
     steps:

From 30d15ad4032e543be37fc0378345d9b6568b20cc Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Tue, 14 May 2024 10:36:48 -0400
Subject: [PATCH 050/130] chore(test): add version check for forward compat
 test (#7685)

A test for https://github.com/neondatabase/neon/pull/7684.

This pull request checks if the pageserver version we specified is the
one actually running by comparing the git hash in forward compatibility
tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_compatibility.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index ef35bf4696..65649e0c0a 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -1,4 +1,5 @@
 import os
+import re
 import shutil
 import subprocess
 import tempfile
@@ -245,14 +246,34 @@ def test_forward_compatibility(
             compatibility_snapshot_dir / "repo",
         )
 
+        # not using env.pageserver.version because it was initialized before
+        prev_pageserver_version_str = env.get_binary_version("pageserver")
+        prev_pageserver_version_match = re.search(
+            "Neon page server git-env:(.*) failpoints: (.*), features: (.*)",
+            prev_pageserver_version_str,
+        )
+        if prev_pageserver_version_match is not None:
+            prev_pageserver_version = prev_pageserver_version_match.group(1)
+        else:
+            raise AssertionError(
+                "cannot find git hash in the version string: " + prev_pageserver_version_str
+            )
+
+        # does not include logs from previous runs
+        assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
+
         neon_env_builder.start()
 
+        # ensure the specified pageserver is running
+        assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
+
         check_neon_works(
             env,
             test_output_dir=test_output_dir,
             sql_dump_path=compatibility_snapshot_dir / "dump.sql",
             repo_dir=env.repo_dir,
         )
+
     except Exception:
         if breaking_changes_allowed:
             pytest.xfail(

From 82960b2175211c0f666b91b5258c5e2253a245c7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 14 May 2024 16:39:17 +0100
Subject: [PATCH 051/130] pageserver: skip waiting for logical size on shard >0
 (#7744)

## Problem

Shards with number >0 could hang waiting for
`await_initial_logical_size`, as we don't calculate logical size on
these shards. This causes them to hold onto semaphore units and starve
other tenants out from proceeding with warmup activation.

That doesn't hurt availability (we still have on-demand activation), but
it does mean that some background tasks like consumption metrics would
omit some tenants.

## Summary of changes

- Skip waiting for logical size calculation on shards >0
- Upgrade unexpected code paths to use debug_assert!(), which acts as an
implicit regression test for this issue, and make the info() one into a
warn()
---
 .../src/tenant/remote_timeline_client.rs      |  5 +++++
 pageserver/src/tenant/timeline.rs             | 19 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 9103760388..630ade5c13 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1127,6 +1127,11 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    pub(crate) fn is_deleting(&self) -> bool {
+        let mut locked = self.upload_queue.lock().unwrap();
+        locked.stopped_mut().is_ok()
+    }
+
     pub(crate) async fn preserve_initdb_archive(
         self: &Arc<Self>,
         tenant_id: &TenantId,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 9ee24a4ff0..ca34b4fadc 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2631,6 +2631,7 @@ impl Timeline {
                             // Don't make noise.
                         } else {
                             warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
+                            debug_assert!(false);
                         }
                     }
                 };
@@ -4355,6 +4356,21 @@ impl Timeline {
     /// this Timeline is shut down.  Calling this function will cause the initial
     /// logical size calculation to skip waiting for the background jobs barrier.
     pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
+        if !self.shard_identity.is_shard_zero() {
+            // We don't populate logical size on shard >0: skip waiting for it.
+            return;
+        }
+
+        if self
+            .remote_client
+            .as_ref()
+            .map(|c| c.is_deleting())
+            .unwrap_or(false)
+        {
+            // The timeline was created in a deletion-resume state, we don't expect logical size to be populated
+            return;
+        }
+
         if let Some(await_bg_cancel) = self
             .current_logical_size
             .cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -4366,9 +4382,10 @@ impl Timeline {
             // the logical size cancellation to skip the concurrency limit semaphore.
             // TODO: this is an unexpected case.  We should restructure so that it
             // can't happen.
-            tracing::info!(
+            tracing::warn!(
                 "await_initial_logical_size: can't get semaphore cancel token, skipping"
             );
+            debug_assert!(false);
         }
 
         tokio::select!(

From e67fcf9563faa1ebf0653a378a1999f446a3c1a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 14 May 2024 17:49:19 +0200
Subject: [PATCH 052/130] Update mold to 2.31 (#7757)

The [2.31.0 release](https://github.com/rui314/mold/releases/tag/v2.31.0) of mold
includes a 10% speed improvement for binaries with a lot of debug info.
As we have such, it might be useful to update mold to the latest
release. The jump is from 2.4.0 to 2.31.0, but it's not been many
releases in between as the version number was raised by the mold
maintainers to 2.30.0 after 2.4.1 [to avoid confusion for some
tools](https://github.com/rui314/mold/releases/tag/v2.30.0).
---
 Dockerfile.build-tools | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index 19739cc1f8..460b8c996d 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
     && rm awscliv2.zip
 
 # Mold: A Modern Linker
-ENV MOLD_VERSION v2.4.0
+ENV MOLD_VERSION v2.31.0
 RUN set -e \
     && git clone https://github.com/rui314/mold.git \
     && mkdir mold/build \

From 4eedb3b6f188408cd74397f81e02fc4c416eb5f5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 14 May 2024 18:03:08 +0200
Subject: [PATCH 053/130] test suite: allow overriding default compaction
 algorithm via env var (#7747)

This PR allows setting the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM` env var to
override the `tenant_config.compaction_algorithm` field in the initial
`pageserver.toml` for all tests.

I tested manually that this works by halting a test using pdb and
inspecting the `effective_config` in the tenant status managment API.

If the env var is set, the tests are parametrized by the `kind` tag
field, allowing to do a matrix build in CI and let Allure summarize
everything in a nice report.

If the env var is not set, the tests are not parametrized. So, merging
this PR doesn't cause problems for flaky test detection. In fact, it
doesn't cause any runtime change if the env var is not set.

There are some tests in the test suite that set used to override
the entire tenant_config using
`NeonEnvBuilder.pageserver_config_override`.
Since config overrides are merged non-recursively, such overrides
that don't specify `kind = ` cause a fallback to pageserver's built-in
`DEFAULT_COMPACTION_ALGORITHM`.

Such cases can be found using

```
["']tenant_config\s*[='"]
```

We'll deal with these tests in a future PR.

closes https://github.com/neondatabase/neon/issues/7555
---
 scripts/flaky_tests.py                | 25 ++++++++++++++++++++++---
 test_runner/fixtures/neon_fixtures.py | 18 ++++++++++++++++++
 test_runner/fixtures/parametrize.py   | 27 ++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 878840fcee..919a9278a9 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -5,10 +5,11 @@ import json
 import logging
 import os
 from collections import defaultdict
-from typing import DefaultDict, Dict
+from typing import Any, DefaultDict, Dict, Optional
 
 import psycopg2
 import psycopg2.extras
+import toml
 
 FLAKY_TESTS_QUERY = """
     SELECT
@@ -58,6 +59,24 @@ def main(args: argparse.Namespace):
     else:
         pageserver_virtual_file_io_engine_parameter = ""
 
+    # re-use existing records of flaky tests from before parametrization by compaction_algorithm
+    def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
+        """Duplicated from parametrize.py"""
+        toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
+        if toml_table is None:
+            return None
+        v = toml.loads(toml_table)
+        assert isinstance(v, dict)
+        return v
+
+    pageserver_default_tenant_config_compaction_algorithm_parameter = ""
+    if (
+        explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
+    ) is not None:
+        pageserver_default_tenant_config_compaction_algorithm_parameter = (
+            f"-{explicit_default['kind']}"
+        )
+
     for row in rows:
         # We don't want to automatically rerun tests in a performance suite
         if row["parent_suite"] != "test_runner.regress":
@@ -66,10 +85,10 @@ def main(args: argparse.Namespace):
         if row["name"].endswith("]"):
             parametrized_test = row["name"].replace(
                 "[",
-                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-",
+                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-",
             )
         else:
-            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]"
+            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]"
 
         res[row["parent_suite"]][row["suite"]][parametrized_test] = True
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8432655370..62a4b974a3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -467,6 +467,7 @@ class NeonEnvBuilder:
         initial_timeline: Optional[TimelineId] = None,
         pageserver_virtual_file_io_engine: Optional[str] = None,
         pageserver_aux_file_policy: Optional[AuxFileStore] = None,
+        pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -507,6 +508,14 @@ class NeonEnvBuilder:
 
         self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
 
+        self.pageserver_default_tenant_config_compaction_algorithm: Optional[
+            Dict[str, Any]
+        ] = pageserver_default_tenant_config_compaction_algorithm
+        if self.pageserver_default_tenant_config_compaction_algorithm is not None:
+            log.debug(
+                f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
+            )
+
         self.pageserver_get_vectored_impl: Optional[str] = None
         if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
             self.pageserver_get_vectored_impl = "vectored"
@@ -1103,6 +1112,11 @@ class NeonEnv:
                 ps_cfg["get_impl"] = config.pageserver_get_impl
             if config.pageserver_validate_vectored_get is not None:
                 ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get
+            if config.pageserver_default_tenant_config_compaction_algorithm is not None:
+                tenant_config = ps_cfg.setdefault("tenant_config", {})
+                tenant_config[
+                    "compaction_algorithm"
+                ] = config.pageserver_default_tenant_config_compaction_algorithm
 
             if self.pageserver_remote_storage is not None:
                 ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
@@ -1304,6 +1318,7 @@ def _shared_simple_env(
     pg_version: PgVersion,
     pageserver_virtual_file_io_engine: str,
     pageserver_aux_file_policy: Optional[AuxFileStore],
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
 ) -> Iterator[NeonEnv]:
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1335,6 +1350,7 @@ def _shared_simple_env(
         test_output_dir=test_output_dir,
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         pageserver_aux_file_policy=pageserver_aux_file_policy,
+        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
     ) as builder:
         env = builder.init_start()
 
@@ -1375,6 +1391,7 @@ def neon_env_builder(
     top_output_dir: Path,
     pageserver_virtual_file_io_engine: str,
     pageserver_aux_file_policy: Optional[AuxFileStore] = None,
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1409,6 +1426,7 @@ def neon_env_builder(
         test_output_dir=test_output_dir,
         test_overlay_dir=test_overlay_dir,
         pageserver_aux_file_policy=pageserver_aux_file_policy,
+        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
     ) as builder:
         yield builder
 
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 77523a542b..0227285822 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -1,7 +1,8 @@
 import os
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import pytest
+import toml
 from _pytest.python import Metafunc
 
 from fixtures.pg_version import PgVersion
@@ -37,6 +38,20 @@ def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
     return None
 
 
+def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
+    toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
+    if toml_table is None:
+        return None
+    v = toml.loads(toml_table)
+    assert isinstance(v, dict)
+    return v
+
+
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
+    return get_pageserver_default_tenant_config_compaction_algorithm()
+
+
 def pytest_generate_tests(metafunc: Metafunc):
     if (bt := os.getenv("BUILD_TYPE")) is None:
         build_types = ["debug", "release"]
@@ -60,6 +75,16 @@ def pytest_generate_tests(metafunc: Metafunc):
     ):
         metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
 
+    # Same hack for pageserver_default_tenant_config_compaction_algorithm
+    if (
+        explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
+    ) is not None:
+        metafunc.parametrize(
+            "pageserver_default_tenant_config_compaction_algorithm",
+            [explicit_default],
+            ids=[explicit_default["kind"]],
+        )
+
     # For performance tests, parametrize also by platform
     if (
         "test_runner/performance" in metafunc.definition._nodeid

From 1a2a3cb446be40b7a1453876f0ea8d128c3435cd Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 10 May 2024 16:23:36 +0300
Subject: [PATCH 054/130] Add restart_lsn metric for logical slots.

---
 vm-image-spec.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index e9d983eba3..fa7cd014bf 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -293,6 +293,16 @@ files:
         values: [checkpoints_timed]
         query: |
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
+
+      # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
+      - metric_name: logical_slot_restart_lsn
+        type: gauge
+        help: 'restart_lsn of logical slots'
+        key_labels:
+          - slot_name
+        values: [restart_lsn]
+        query: |
+          select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical';
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling

From 438bacc32eb5d6888d0aec623006e6046b59299e Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 15 May 2024 12:29:12 +0100
Subject: [PATCH 055/130] CI(neon-extra-builds): Use small-arm64 runners
 instead of large-arm64 (#7740)

## Problem
There are not enough arm runners and jobs in `neon-extra-builds` workflow
take about the same amount of time on a small-arm runner as on
large-arm.

## Summary of changes
- Switch `neon-extra-builds` workflow from `large-arm64` to
`small-arm64` runners
---
 .github/actionlint.yml                  | 1 +
 .github/workflows/neon_extra_builds.yml | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 942861ecd8..37983798b7 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,6 +5,7 @@ self-hosted-runner:
     - large
     - large-arm64
     - small
+    - small-arm64
     - us-east-2
 config-variables:
   - REMOTE_STORAGE_AZURE_CONTAINER
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index fdb03963fb..7d2187e59c 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -136,7 +136,7 @@ jobs:
   check-linux-arm-build:
     needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
-    runs-on: [ self-hosted, large-arm64 ]
+    runs-on: [ self-hosted, small-arm64 ]
 
     env:
       # Use release build only, to have less debug info around
@@ -260,7 +260,7 @@ jobs:
   check-codestyle-rust-arm:
     needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
-    runs-on: [ self-hosted, large-arm64 ]
+    runs-on: [ self-hosted, small-arm64 ]
 
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}

From f342b87f306f5a05cf91d8191833c2df9f6d4acd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 15 May 2024 13:05:24 +0100
Subject: [PATCH 056/130] pageserver: remove Option<> around remote storage,
 clean up metadata file refs (#7752)

## Problem

This is historical baggage from when the pageserver could be run with
local disk only: we had a bunch of places where we had to treat remote
storage as optional.

Closes: https://github.com/neondatabase/neon/issues/6890

## Changes

- Remove Option<> around remote storage (in
https://github.com/neondatabase/neon/pull/7722 we made remote storage
clearly mandatory)
- Remove code for deleting old metadata files: they're all gone now.
- Remove other references to metadata files when loading directories, as
none exist.

I checked last 14 days of logs for "found legacy metadata", there are no
instances.
---
 pageserver/ctl/src/draw_timeline_dir.rs       |   5 -
 pageserver/src/bin/pageserver.rs              |  43 ++--
 pageserver/src/deletion_queue.rs              |  23 +-
 pageserver/src/http/routes.rs                 |  44 +---
 pageserver/src/lib.rs                         |  10 +-
 pageserver/src/tenant.rs                      | 177 ++++++---------
 pageserver/src/tenant/delete.rs               |  55 ++---
 pageserver/src/tenant/mgr.rs                  |  84 ++-----
 .../src/tenant/remote_timeline_client.rs      |   4 +-
 pageserver/src/tenant/secondary/downloader.rs |   8 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  56 ++---
 .../src/tenant/storage_layer/layer/tests.rs   |  10 +-
 pageserver/src/tenant/timeline.rs             | 210 +++++++-----------
 pageserver/src/tenant/timeline/compaction.rs  |  12 +-
 pageserver/src/tenant/timeline/delete.rs      |  40 ++--
 .../src/tenant/timeline/detach_ancestor.rs    |  27 +--
 .../src/tenant/timeline/eviction_task.rs      |   7 +-
 pageserver/src/tenant/timeline/init.rs        |   7 +-
 test_runner/regress/test_broken_timeline.py   |  20 +-
 19 files changed, 285 insertions(+), 557 deletions(-)

diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs
index 4dff8af1fc..389519c65a 100644
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -52,7 +52,6 @@
 
 use anyhow::{Context, Result};
 use pageserver::repository::Key;
-use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -159,10 +158,6 @@ pub fn main() -> Result<()> {
         let line = PathBuf::from_str(&line).unwrap();
         let filename = line.file_name().unwrap();
         let filename = filename.to_str().unwrap();
-        if filename == METADATA_FILE_NAME {
-            // Don't try and parse "metadata" like a key-lsn range
-            continue;
-        }
         let (key_range, lsn_range) = parse_filename(filename);
         files.push(Layer {
             filename: filename.to_owned(),
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 49f8a41b37..c0099aa704 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -383,7 +383,7 @@ fn start_pageserver(
     let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
 
     // Set up remote storage client
-    let remote_storage = Some(create_remote_storage_client(conf)?);
+    let remote_storage = create_remote_storage_client(conf)?;
 
     // Set up deletion queue
     let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -516,16 +516,12 @@ fn start_pageserver(
         }
     });
 
-    let secondary_controller = if let Some(remote_storage) = &remote_storage {
-        secondary::spawn_tasks(
-            tenant_manager.clone(),
-            remote_storage.clone(),
-            background_jobs_barrier.clone(),
-            shutdown_pageserver.clone(),
-        )
-    } else {
-        secondary::null_controller()
-    };
+    let secondary_controller = secondary::spawn_tasks(
+        tenant_manager.clone(),
+        remote_storage.clone(),
+        background_jobs_barrier.clone(),
+        shutdown_pageserver.clone(),
+    );
 
     // shared state between the disk-usage backed eviction background task and the http endpoint
     // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -533,15 +529,13 @@ fn start_pageserver(
     // been configured.
     let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
 
-    if let Some(remote_storage) = &remote_storage {
-        launch_disk_usage_global_eviction_task(
-            conf,
-            remote_storage.clone(),
-            disk_usage_eviction_state.clone(),
-            tenant_manager.clone(),
-            background_jobs_barrier.clone(),
-        )?;
-    }
+    launch_disk_usage_global_eviction_task(
+        conf,
+        remote_storage.clone(),
+        disk_usage_eviction_state.clone(),
+        tenant_manager.clone(),
+        background_jobs_barrier.clone(),
+    )?;
 
     // Start up the service to handle HTTP mgmt API request. We created the
     // listener earlier already.
@@ -693,14 +687,7 @@ fn start_pageserver(
             // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
             // The plan is to change that over time.
             shutdown_pageserver.take();
-            let bg_remote_storage = remote_storage.clone();
-            let bg_deletion_queue = deletion_queue.clone();
-            pageserver::shutdown_pageserver(
-                &tenant_manager,
-                bg_remote_storage.map(|_| bg_deletion_queue),
-                0,
-            )
-            .await;
+            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
             unreachable!()
         })
     }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index c937309d83..8790a9b0a8 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -632,7 +632,7 @@ impl DeletionQueue {
     ///
     /// If remote_storage is None, then the returned workers will also be None.
     pub fn new<C>(
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         control_plane_client: Option<C>,
         conf: &'static PageServerConf,
     ) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,23 +658,6 @@ impl DeletionQueue {
         // longer to flush after Tenants have all been torn down.
         let cancel = CancellationToken::new();
 
-        let remote_storage = match remote_storage {
-            None => {
-                return (
-                    Self {
-                        client: DeletionQueueClient {
-                            tx,
-                            executor_tx,
-                            lsn_table: lsn_table.clone(),
-                        },
-                        cancel,
-                    },
-                    None,
-                )
-            }
-            Some(r) => r,
-        };
-
         (
             Self {
                 client: DeletionQueueClient {
@@ -765,7 +748,7 @@ mod test {
         /// Simulate a pageserver restart by destroying and recreating the deletion queue
         async fn restart(&mut self) {
             let (deletion_queue, workers) = DeletionQueue::new(
-                Some(self.storage.clone()),
+                self.storage.clone(),
                 Some(self.mock_control_plane.clone()),
                 self.harness.conf,
             );
@@ -875,7 +858,7 @@ mod test {
         let mock_control_plane = MockControlPlane::new();
 
         let (deletion_queue, worker) = DeletionQueue::new(
-            Some(storage.clone()),
+            storage.clone(),
             Some(mock_control_plane.clone()),
             harness.conf,
         );
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 2370561756..0a98d32f02 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -104,7 +104,7 @@ pub struct State {
     tenant_manager: Arc<TenantManager>,
     auth: Option<Arc<SwappableJwtAuth>>,
     allowlist_routes: Vec<Uri>,
-    remote_storage: Option<GenericRemoteStorage>,
+    remote_storage: GenericRemoteStorage,
     broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
     deletion_queue_client: DeletionQueueClient,
@@ -118,7 +118,7 @@ impl State {
         conf: &'static PageServerConf,
         tenant_manager: Arc<TenantManager>,
         auth: Option<Arc<SwappableJwtAuth>>,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         broker_client: storage_broker::BrokerClientChannel,
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
         deletion_queue_client: DeletionQueueClient,
@@ -813,12 +813,6 @@ async fn tenant_attach_handler(
 
     let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
 
-    if state.remote_storage.is_none() {
-        return Err(ApiError::BadRequest(anyhow!(
-            "attach_tenant is not possible because pageserver was configured without remote storage"
-        )));
-    }
-
     let tenant_shard_id = TenantShardId::unsharded(tenant_id);
     let shard_params = ShardParameters::default();
     let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
@@ -1643,12 +1637,6 @@ async fn tenant_time_travel_remote_storage_handler(
         )));
     }
 
-    let Some(storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "remote storage not configured, cannot run time travel"
-        )));
-    };
-
     if timestamp > done_if_after {
         return Err(ApiError::BadRequest(anyhow!(
             "The done_if_after timestamp comes before the timestamp to recover to"
@@ -1658,7 +1646,7 @@ async fn tenant_time_travel_remote_storage_handler(
     tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
 
     remote_timeline_client::upload::time_travel_recover_tenant(
-        storage,
+        &state.remote_storage,
         &tenant_shard_id,
         timestamp,
         done_if_after,
@@ -1903,11 +1891,6 @@ async fn deletion_queue_flush(
 ) -> Result<Response<Body>, ApiError> {
     let state = get_state(&r);
 
-    if state.remote_storage.is_none() {
-        // Nothing to do if remote storage is disabled.
-        return json_response(StatusCode::OK, ());
-    }
-
     let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
 
     let flush = async {
@@ -2072,18 +2055,11 @@ async fn disk_usage_eviction_run(
     };
 
     let state = get_state(&r);
-
-    let Some(storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "remote storage not configured, cannot run eviction iteration"
-        )));
-    };
-
     let eviction_state = state.disk_usage_eviction_state.clone();
 
     let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
         &eviction_state,
-        storage,
+        &state.remote_storage,
         usage,
         &state.tenant_manager,
         config.eviction_order,
@@ -2120,29 +2096,23 @@ async fn tenant_scan_remote_handler(
     let state = get_state(&request);
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
 
-    let Some(remote_storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Remote storage not configured"
-        )));
-    };
-
     let mut response = TenantScanRemoteStorageResponse::default();
 
     let (shards, _other_keys) =
-        list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
+        list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
             .await
             .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
 
     for tenant_shard_id in shards {
         let (timeline_ids, _other_keys) =
-            list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
+            list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
                 .await
                 .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
 
         let mut generation = Generation::none();
         for timeline_id in timeline_ids {
             match download_index_part(
-                remote_storage,
+                &state.remote_storage,
                 &tenant_shard_id,
                 &timeline_id,
                 Generation::MAX,
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 930700e50c..c69fb8c83b 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
 #[tracing::instrument(skip_all, fields(%exit_code))]
 pub async fn shutdown_pageserver(
     tenant_manager: &TenantManager,
-    deletion_queue: Option<DeletionQueue>,
+    mut deletion_queue: DeletionQueue,
     exit_code: i32,
 ) {
     use std::time::Duration;
@@ -89,9 +89,7 @@ pub async fn shutdown_pageserver(
     .await;
 
     // Best effort to persist any outstanding deletions, to avoid leaking objects
-    if let Some(mut deletion_queue) = deletion_queue {
-        deletion_queue.shutdown(Duration::from_secs(5)).await;
-    }
+    deletion_queue.shutdown(Duration::from_secs(5)).await;
 
     // Shut down the HTTP endpoint last, so that you can still check the server's
     // status while it's shutting down.
@@ -114,10 +112,6 @@ pub async fn shutdown_pageserver(
     std::process::exit(exit_code);
 }
 
-/// The name of the metadata file pageserver creates per timeline.
-/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
-pub const METADATA_FILE_NAME: &str = "metadata";
-
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
 pub(crate) const TENANT_CONFIG_NAME: &str = "config";
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 80d354d79e..026cbc107c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -190,7 +190,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
 #[derive(Clone)]
 pub struct TenantSharedResources {
     pub broker_client: storage_broker::BrokerClientChannel,
-    pub remote_storage: Option<GenericRemoteStorage>,
+    pub remote_storage: GenericRemoteStorage,
     pub deletion_queue_client: DeletionQueueClient,
 }
 
@@ -292,7 +292,7 @@ pub struct Tenant {
     walredo_mgr: Option<Arc<WalRedoManager>>,
 
     // provides access to timeline data sitting in the remote storage
-    pub(crate) remote_storage: Option<GenericRemoteStorage>,
+    pub(crate) remote_storage: GenericRemoteStorage,
 
     // Access to global deletion queue for when this tenant wants to schedule a deletion
     deletion_queue_client: DeletionQueueClient,
@@ -551,21 +551,22 @@ impl Tenant {
         );
 
         if let Some(index_part) = index_part.as_ref() {
-            timeline
-                .remote_client
-                .as_ref()
-                .unwrap()
-                .init_upload_queue(index_part)?;
-        } else if self.remote_storage.is_some() {
+            timeline.remote_client.init_upload_queue(index_part)?;
+        } else {
             // No data on the remote storage, but we have local metadata file. We can end up
             // here with timeline_create being interrupted before finishing index part upload.
             // By doing what we do here, the index part upload is retried.
             // If control plane retries timeline creation in the meantime, the mgmt API handler
             // for timeline creation will coalesce on the upload we queue here.
+
             // FIXME: this branch should be dead code as we no longer write local metadata.
-            let rtc = timeline.remote_client.as_ref().unwrap();
-            rtc.init_upload_queue_for_empty_remote(&metadata)?;
-            rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+            timeline
+                .remote_client
+                .init_upload_queue_for_empty_remote(&metadata)?;
+            timeline
+                .remote_client
+                .schedule_index_upload_for_full_metadata_update(&metadata)?;
         }
 
         timeline
@@ -777,14 +778,14 @@ impl Tenant {
                     AttachType::Normal
                 };
 
-                let preload = match (&mode, &remote_storage) {
-                    (SpawnMode::Create, _) => {
+                let preload = match &mode {
+                    SpawnMode::Create => {
                         None
                     },
-                    (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
+                    SpawnMode::Eager | SpawnMode::Lazy => {
                         let _preload_timer = TENANT.preload.start_timer();
                         let res = tenant_clone
-                            .preload(remote_storage, task_mgr::shutdown_token())
+                            .preload(&remote_storage, task_mgr::shutdown_token())
                             .await;
                         match res {
                             Ok(p) => Some(p),
@@ -794,10 +795,7 @@ impl Tenant {
                             }
                         }
                     }
-                    (_, None) => {
-                        let _preload_timer = TENANT.preload.start_timer();
-                        None
-                    }
+
                 };
 
                 // Remote preload is complete.
@@ -1021,7 +1019,7 @@ impl Tenant {
                 index_part,
                 remote_metadata,
                 TimelineResources {
-                    remote_client: Some(remote_client),
+                    remote_client,
                     deletion_queue_client: self.deletion_queue_client.clone(),
                     timeline_get_throttle: self.timeline_get_throttle.clone(),
                 },
@@ -1047,7 +1045,7 @@ impl Tenant {
                 Arc::clone(self),
                 timeline_id,
                 &index_part.metadata,
-                Some(remote_timeline_client),
+                remote_timeline_client,
                 self.deletion_queue_client.clone(),
             )
             .instrument(tracing::info_span!("timeline_delete", %timeline_id))
@@ -1139,9 +1137,7 @@ impl Tenant {
         let mut size = 0;
 
         for timeline in self.list_timelines() {
-            if let Some(remote_client) = &timeline.remote_client {
-                size += remote_client.get_remote_physical_size();
-            }
+            size += timeline.remote_client.get_remote_physical_size();
         }
 
         size
@@ -1191,6 +1187,7 @@ impl Tenant {
     pub fn create_broken_tenant(
         conf: &'static PageServerConf,
         tenant_shard_id: TenantShardId,
+        remote_storage: GenericRemoteStorage,
         reason: String,
     ) -> Arc<Tenant> {
         Arc::new(Tenant::new(
@@ -1205,7 +1202,7 @@ impl Tenant {
             ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
             None,
             tenant_shard_id,
-            None,
+            remote_storage,
             DeletionQueueClient::broken(),
         ))
     }
@@ -1398,13 +1395,7 @@ impl Tenant {
         tline.freeze_and_flush().await.context("freeze_and_flush")?;
 
         // Make sure the freeze_and_flush reaches remote storage.
-        tline
-            .remote_client
-            .as_ref()
-            .unwrap()
-            .wait_completion()
-            .await
-            .unwrap();
+        tline.remote_client.wait_completion().await.unwrap();
 
         let tl = uninit_tl.finish_creation()?;
         // The non-test code would call tl.activate() here.
@@ -1470,20 +1461,19 @@ impl Tenant {
                     return Err(CreateTimelineError::Conflict);
                 }
 
-                if let Some(remote_client) = existing.remote_client.as_ref() {
-                    // Wait for uploads to complete, so that when we return Ok, the timeline
-                    // is known to be durable on remote storage. Just like we do at the end of
-                    // this function, after we have created the timeline ourselves.
-                    //
-                    // We only really care that the initial version of `index_part.json` has
-                    // been uploaded. That's enough to remember that the timeline
-                    // exists. However, there is no function to wait specifically for that so
-                    // we just wait for all in-progress uploads to finish.
-                    remote_client
-                        .wait_completion()
-                        .await
-                        .context("wait for timeline uploads to complete")?;
-                }
+                // Wait for uploads to complete, so that when we return Ok, the timeline
+                // is known to be durable on remote storage. Just like we do at the end of
+                // this function, after we have created the timeline ourselves.
+                //
+                // We only really care that the initial version of `index_part.json` has
+                // been uploaded. That's enough to remember that the timeline
+                // exists. However, there is no function to wait specifically for that so
+                // we just wait for all in-progress uploads to finish.
+                existing
+                    .remote_client
+                    .wait_completion()
+                    .await
+                    .context("wait for timeline uploads to complete")?;
 
                 return Ok(existing);
             }
@@ -1559,14 +1549,14 @@ impl Tenant {
         // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet.  We must
         // not send a success to the caller until it is.  The same applies to handling retries,
         // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
-        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
-            let kind = ancestor_timeline_id
-                .map(|_| "branched")
-                .unwrap_or("bootstrapped");
-            remote_client.wait_completion().await.with_context(|| {
-                format!("wait for {} timeline initial uploads to complete", kind)
-            })?;
-        }
+        let kind = ancestor_timeline_id
+            .map(|_| "branched")
+            .unwrap_or("bootstrapped");
+        loaded_timeline
+            .remote_client
+            .wait_completion()
+            .await
+            .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
 
         loaded_timeline.activate(self.clone(), broker_client, None, ctx);
 
@@ -2161,32 +2151,26 @@ impl Tenant {
     ) -> anyhow::Result<()> {
         let timelines = self.timelines.lock().unwrap().clone();
         for timeline in timelines.values() {
-            let Some(tl_client) = &timeline.remote_client else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
-            let Some(remote_storage) = &self.remote_storage else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
             // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
             // to ensure that they do not start a split if currently in the process of doing these.
 
             // Upload an index from the parent: this is partly to provide freshness for the
             // child tenants that will copy it, and partly for general ease-of-debugging: there will
             // always be a parent shard index in the same generation as we wrote the child shard index.
-            tl_client.schedule_index_upload_for_file_changes()?;
-            tl_client.wait_completion().await?;
+            timeline
+                .remote_client
+                .schedule_index_upload_for_file_changes()?;
+            timeline.remote_client.wait_completion().await?;
 
             // Shut down the timeline's remote client: this means that the indices we write
             // for child shards will not be invalidated by the parent shard deleting layers.
-            tl_client.shutdown().await;
+            timeline.remote_client.shutdown().await;
 
             // Download methods can still be used after shutdown, as they don't flow through the remote client's
             // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
             // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
             // we use here really is the remotely persistent one).
-            let result = tl_client
+            let result = timeline.remote_client
                 .download_index_file(&self.cancel)
                 .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
                 .await?;
@@ -2199,7 +2183,7 @@ impl Tenant {
 
             for child_shard in child_shards {
                 upload_index_part(
-                    remote_storage,
+                    &self.remote_storage,
                     child_shard,
                     &timeline.timeline_id,
                     self.generation,
@@ -2475,7 +2459,7 @@ impl Tenant {
         shard_identity: ShardIdentity,
         walredo_mgr: Option<Arc<WalRedoManager>>,
         tenant_shard_id: TenantShardId,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         deletion_queue_client: DeletionQueueClient,
     ) -> Tenant {
         let (state, mut rx) = watch::channel(state);
@@ -3119,11 +3103,10 @@ impl Tenant {
         // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
         // could get incorrect information and remove more layers, than needed.
         // See also https://github.com/neondatabase/neon/issues/3865
-        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
-            remote_client
-                .schedule_index_upload_for_full_metadata_update(&metadata)
-                .context("branch initial metadata upload")?;
-        }
+        new_timeline
+            .remote_client
+            .schedule_index_upload_for_full_metadata_update(&metadata)
+            .context("branch initial metadata upload")?;
 
         Ok(new_timeline)
     }
@@ -3155,11 +3138,6 @@ impl Tenant {
         pgdata_path: &Utf8PathBuf,
         timeline_id: &TimelineId,
     ) -> anyhow::Result<()> {
-        let Some(storage) = &self.remote_storage else {
-            // No remote storage?  No upload.
-            return Ok(());
-        };
-
         let temp_path = timelines_path.join(format!(
             "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
         ));
@@ -3183,7 +3161,7 @@ impl Tenant {
         backoff::retry(
             || async {
                 self::remote_timeline_client::upload_initdb_dir(
-                    storage,
+                    &self.remote_storage,
                     &self.tenant_shard_id.tenant_id,
                     timeline_id,
                     pgdata_zstd.try_clone().await?,
@@ -3240,9 +3218,6 @@ impl Tenant {
             }
         }
         if let Some(existing_initdb_timeline_id) = load_existing_initdb {
-            let Some(storage) = &self.remote_storage else {
-                bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
-            };
             if existing_initdb_timeline_id != timeline_id {
                 let source_path = &remote_initdb_archive_path(
                     &self.tenant_shard_id.tenant_id,
@@ -3252,7 +3227,7 @@ impl Tenant {
                     &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
 
                 // if this fails, it will get retried by retried control plane requests
-                storage
+                self.remote_storage
                     .copy_object(source_path, dest_path, &self.cancel)
                     .await
                     .context("copy initdb tar")?;
@@ -3260,7 +3235,7 @@ impl Tenant {
             let (initdb_tar_zst_path, initdb_tar_zst) =
                 self::remote_timeline_client::download_initdb_tar_zst(
                     self.conf,
-                    storage,
+                    &self.remote_storage,
                     &self.tenant_shard_id,
                     &existing_initdb_timeline_id,
                     &self.cancel,
@@ -3355,20 +3330,14 @@ impl Tenant {
 
     /// Call this before constructing a timeline, to build its required structures
     fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
-        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
-                self.conf,
-                self.tenant_shard_id,
-                timeline_id,
-                self.generation,
-            );
-            Some(remote_client)
-        } else {
-            None
-        };
-
+        let remote_client = RemoteTimelineClient::new(
+            self.remote_storage.clone(),
+            self.deletion_queue_client.clone(),
+            self.conf,
+            self.tenant_shard_id,
+            timeline_id,
+            self.generation,
+        );
         TimelineResources {
             remote_client,
             deletion_queue_client: self.deletion_queue_client.clone(),
@@ -3392,9 +3361,9 @@ impl Tenant {
         let tenant_shard_id = self.tenant_shard_id;
 
         let resources = self.build_timeline_resources(new_timeline_id);
-        if let Some(remote_client) = &resources.remote_client {
-            remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
-        }
+        resources
+            .remote_client
+            .init_upload_queue_for_empty_remote(new_metadata)?;
 
         let timeline_struct = self
             .create_timeline_struct(
@@ -3562,9 +3531,7 @@ impl Tenant {
             tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
             timeline.freeze_and_flush().await?;
             tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
-            if let Some(client) = &timeline.remote_client {
-                client.wait_completion().await?;
-            }
+            timeline.remote_client.wait_completion().await?;
 
             Ok(())
         }
@@ -3878,7 +3845,7 @@ pub(crate) mod harness {
                 ShardIdentity::unsharded(),
                 Some(walredo_mgr),
                 self.tenant_shard_id,
-                Some(self.remote_storage.clone()),
+                self.remote_storage.clone(),
                 self.deletion_queue.new_client(),
             ));
 
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 2e5259bfe2..3173a33dad 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -181,25 +181,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
 
 async fn remove_tenant_remote_delete_mark(
     conf: &PageServerConf,
-    remote_storage: Option<&GenericRemoteStorage>,
+    remote_storage: &GenericRemoteStorage,
     tenant_shard_id: &TenantShardId,
     cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
-    if let Some(remote_storage) = remote_storage {
-        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
-        backoff::retry(
-            || async { remote_storage.delete(&path, cancel).await },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_tenant_remote_delete_mark",
-            cancel,
-        )
-        .await
-        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-        .and_then(|x| x)
-        .context("remove_tenant_remote_delete_mark")?;
-    }
+    let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
+    backoff::retry(
+        || async { remote_storage.delete(&path, cancel).await },
+        TimeoutOrCancel::caused_by_cancel,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "remove_tenant_remote_delete_mark",
+        cancel,
+    )
+    .await
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+    .and_then(|x| x)
+    .context("remove_tenant_remote_delete_mark")?;
     Ok(())
 }
 
@@ -297,7 +295,7 @@ impl DeleteTenantFlow {
     #[instrument(skip_all)]
     pub(crate) async fn run(
         conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
         cancel: &CancellationToken,
@@ -308,9 +306,7 @@ impl DeleteTenantFlow {
 
         let mut guard = Self::prepare(&tenant).await?;
 
-        if let Err(e) =
-            Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
-        {
+        if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
             tenant.set_broken(format!("{e:#}")).await;
             return Err(e);
         }
@@ -327,7 +323,7 @@ impl DeleteTenantFlow {
     async fn run_inner(
         guard: &mut OwnedMutexGuard<Self>,
         conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
+        remote_storage: &GenericRemoteStorage,
         tenant: &Tenant,
         cancel: &CancellationToken,
     ) -> Result<(), DeleteTenantError> {
@@ -339,14 +335,9 @@ impl DeleteTenantFlow {
             ))?
         });
 
-        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
-        // Though sounds scary, different mark name?
-        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
-        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
-                .await
-                .context("remote_mark")?
-        }
+        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+            .await
+            .context("remote_mark")?;
 
         fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
             Err(anyhow::anyhow!(
@@ -483,7 +474,7 @@ impl DeleteTenantFlow {
     fn schedule_background(
         guard: OwnedMutexGuard<Self>,
         conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
     ) {
@@ -512,7 +503,7 @@ impl DeleteTenantFlow {
     async fn background(
         mut guard: OwnedMutexGuard<Self>,
         conf: &PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: &Arc<Tenant>,
     ) -> Result<(), DeleteTenantError> {
@@ -551,7 +542,7 @@ impl DeleteTenantFlow {
 
         remove_tenant_remote_delete_mark(
             conf,
-            remote_storage.as_ref(),
+            &remote_storage,
             &tenant.tenant_shard_id,
             &task_mgr::shutdown_token(),
         )
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7a3e36bf02..5abda7b64e 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -47,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -391,22 +391,17 @@ async fn init_load_generations(
     // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
     // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
     // are processed, even though we don't block on recovery completing here.
-    //
-    // Must only do this if remote storage is enabled, otherwise deletion queue
-    // is not running and channel push will fail.
-    if resources.remote_storage.is_some() {
-        let attached_tenants = generations
-            .iter()
-            .flat_map(|(id, start_mode)| {
-                match start_mode {
-                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
-                    TenantStartupMode::Secondary => None,
-                }
-                .map(|gen| (*id, *gen))
-            })
-            .collect();
-        resources.deletion_queue_client.recover(attached_tenants)?;
-    }
+    let attached_tenants = generations
+        .iter()
+        .flat_map(|(id, start_mode)| {
+            match start_mode {
+                TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                TenantStartupMode::Secondary => None,
+            }
+            .map(|gen| (*id, *gen))
+        })
+        .collect();
+    resources.deletion_queue_client.recover(attached_tenants)?;
 
     Ok(Some(generations))
 }
@@ -460,53 +455,6 @@ fn load_tenant_config(
         }
     };
 
-    // Clean up legacy `metadata` files.
-    // Doing it here because every single tenant directory is visited here.
-    // In any later code, there's different treatment of tenant dirs
-    // ... depending on whether the tenant is in re-attach response or not
-    // ... epending on whether the tenant is ignored or not
-    assert_eq!(
-        &conf.tenant_path(&tenant_shard_id),
-        &tenant_dir_path,
-        "later use of conf....path() methods would be dubious"
-    );
-    let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
-        Ok(iter) => {
-            let mut timelines = Vec::new();
-            for res in iter {
-                let p = res?;
-                let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
-                    // skip any entries that aren't TimelineId, such as
-                    // - *.___temp dirs
-                    // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
-                    continue;
-                };
-                timelines.push(timeline_id);
-            }
-            timelines
-        }
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
-        Err(e) => return Err(anyhow::anyhow!(e)),
-    };
-    for timeline_id in timelines {
-        let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
-        let metadata_path = timeline_path.join(METADATA_FILE_NAME);
-        match std::fs::remove_file(&metadata_path) {
-            Ok(()) => {
-                crashsafe::fsync(timeline_path)
-                    .context("fsync timeline dir after removing legacy metadata file")?;
-                info!("removed legacy metadata file at {metadata_path}");
-            }
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                // something removed the file earlier, or it was never there
-                // We don't care, this software version doesn't write it again, so, we're good.
-            }
-            Err(e) => {
-                anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
-            }
-        }
-    }
-
     let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
     if tenant_ignore_mark_file.exists() {
         info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
@@ -611,6 +559,7 @@ pub async fn init_tenant_mgr(
                     TenantSlot::Attached(Tenant::create_broken_tenant(
                         conf,
                         tenant_shard_id,
+                        resources.remote_storage.clone(),
                         format!("{}", e),
                     )),
                 );
@@ -803,6 +752,7 @@ fn tenant_spawn(
         "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
     );
 
+    let remote_storage = resources.remote_storage.clone();
     let tenant = match Tenant::spawn(
         conf,
         tenant_shard_id,
@@ -817,7 +767,7 @@ fn tenant_spawn(
         Ok(tenant) => tenant,
         Err(e) => {
             error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
-            Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
+            Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
         }
     };
 
@@ -2276,7 +2226,7 @@ pub(crate) async fn load_tenant(
     tenant_id: TenantId,
     generation: Generation,
     broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    remote_storage: GenericRemoteStorage,
     deletion_queue_client: DeletionQueueClient,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -2937,7 +2887,7 @@ pub(crate) async fn immediate_gc(
         }
 
         let timeline = tenant.get_timeline(timeline_id, false).ok();
-        let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
+        let rtc = timeline.as_ref().map(|x| &x.remote_client);
 
         if let Some(rtc) = rtc {
             // layer drops schedule actions on remote timeline client to actually do the
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 630ade5c13..c5462dac43 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2137,7 +2137,7 @@ mod tests {
             tenant_ctx: _tenant_ctx,
         } = test_setup;
 
-        let client = timeline.remote_client.as_ref().unwrap();
+        let client = &timeline.remote_client;
 
         // Download back the index.json, and check that the list of files is correct
         let initial_index_part = match client
@@ -2328,7 +2328,7 @@ mod tests {
             timeline,
             ..
         } = TestSetup::new("metrics").await.unwrap();
-        let client = timeline.remote_client.as_ref().unwrap();
+        let client = &timeline.remote_client;
 
         let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
         let local_path = local_layer_path(
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index c28e041fa2..46a3d7e81f 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -26,7 +26,7 @@ use crate::{
         tasks::{warn_when_period_overrun, BackgroundLoopKind},
     },
     virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
-    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
+    TEMP_FILE_SUFFIX,
 };
 
 use super::{
@@ -1074,11 +1074,7 @@ async fn init_timeline_state(
             .fatal_err(&format!("Read metadata on {}", file_path));
 
         let file_name = file_path.file_name().expect("created it from the dentry");
-        if file_name == METADATA_FILE_NAME {
-            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
-            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
-            continue;
-        } else if crate::is_temporary(&file_path)
+        if crate::is_temporary(&file_path)
             || is_temp_download_file(&file_path)
             || is_ephemeral_file(file_name)
         {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b6f7702247..e8c712c4c6 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -585,9 +585,6 @@ struct LayerInner {
     /// [`Timeline::gate`] at the same time.
     timeline: Weak<Timeline>,
 
-    /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
-    have_remote_client: bool,
-
     access_stats: LayerAccessStats,
 
     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
@@ -732,23 +729,23 @@ impl Drop for LayerInner {
             if removed {
                 timeline.metrics.resident_physical_size_sub(file_size);
             }
-            if let Some(remote_client) = timeline.remote_client.as_ref() {
-                let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
+            let res = timeline
+                .remote_client
+                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);
 
-                if let Err(e) = res {
-                    // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                    // demonstrating this deadlock (without spawn_blocking): stop will drop
-                    // queued items, which will have ResidentLayer's, and those drops would try
-                    // to re-entrantly lock the RemoteTimelineClient inner state.
-                    if !timeline.is_active() {
-                        tracing::info!("scheduling deletion on drop failed: {e:#}");
-                    } else {
-                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
-                    }
-                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+            if let Err(e) = res {
+                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                // demonstrating this deadlock (without spawn_blocking): stop will drop
+                // queued items, which will have ResidentLayer's, and those drops would try
+                // to re-entrantly lock the RemoteTimelineClient inner state.
+                if !timeline.is_active() {
+                    tracing::info!("scheduling deletion on drop failed: {e:#}");
                 } else {
-                    LAYER_IMPL_METRICS.inc_completed_deletes();
+                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
                 }
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+            } else {
+                LAYER_IMPL_METRICS.inc_completed_deletes();
             }
         });
     }
@@ -786,7 +783,6 @@ impl LayerInner {
             path: local_path,
             desc,
             timeline: Arc::downgrade(timeline),
-            have_remote_client: timeline.remote_client.is_some(),
             access_stats,
             wanted_deleted: AtomicBool::new(false),
             inner,
@@ -815,8 +811,6 @@ impl LayerInner {
     /// in a new attempt to evict OR join the previously started attempt.
     #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
     pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
-        assert!(self.have_remote_client);
-
         let mut rx = self.status.as_ref().unwrap().subscribe();
 
         {
@@ -973,10 +967,6 @@ impl LayerInner {
             return Err(DownloadError::NotFile(ft));
         }
 
-        if timeline.remote_client.as_ref().is_none() {
-            return Err(DownloadError::NoRemoteStorage);
-        }
-
         if let Some(ctx) = ctx {
             self.check_expected_download(ctx)?;
         }
@@ -1113,12 +1103,8 @@ impl LayerInner {
         permit: heavier_once_cell::InitPermit,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<DownloadedLayer>> {
-        let client = timeline
+        let result = timeline
             .remote_client
-            .as_ref()
-            .expect("checked before download_init_and_wait");
-
-        let result = client
             .download_layer_file(
                 &self.desc.layer_name(),
                 &self.metadata(),
@@ -1293,20 +1279,10 @@ impl LayerInner {
 
     /// `DownloadedLayer` is being dropped, so it calls this method.
     fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
-        let can_evict = self.have_remote_client;
-
         // we cannot know without inspecting LayerInner::inner if we should evict or not, even
         // though here it is very likely
         let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
 
-        if !can_evict {
-            // it would be nice to assert this case out, but we are in drop
-            span.in_scope(|| {
-                tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
-            });
-            return;
-        }
-
         // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
         // drop while the `self.inner` is being locked, leading to a deadlock.
 
@@ -1578,8 +1554,6 @@ pub(crate) enum EvictionError {
 pub(crate) enum DownloadError {
     #[error("timeline has already shutdown")]
     TimelineShutdown,
-    #[error("no remote storage configured")]
-    NoRemoteStorage,
     #[error("context denies downloading")]
     ContextAndConfigReallyDeniesDownloads,
     #[error("downloading is really required but not allowed by this method")]
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 52f62faa8d..fa9142d5e9 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -145,7 +145,7 @@ async fn smoke_test() {
         .await
         .expect("the local layer file still exists");
 
-    let rtc = timeline.remote_client.as_ref().unwrap();
+    let rtc = &timeline.remote_client;
 
     {
         let layers = &[layer];
@@ -761,13 +761,7 @@ async fn eviction_cancellation_on_drop() {
     timeline.freeze_and_flush().await.unwrap();
 
     // wait for the upload to complete so our Arc::strong_count assertion holds
-    timeline
-        .remote_client
-        .as_ref()
-        .unwrap()
-        .wait_completion()
-        .await
-        .unwrap();
+    timeline.remote_client.wait_completion().await.unwrap();
 
     let (evicted_layer, not_evicted) = {
         let mut layers = {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ca34b4fadc..df9bc9b35b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -200,7 +200,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
-    pub remote_client: Option<RemoteTimelineClient>,
+    pub remote_client: RemoteTimelineClient,
     pub deletion_queue_client: DeletionQueueClient,
     pub timeline_get_throttle: Arc<
         crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
@@ -272,7 +272,7 @@ pub struct Timeline {
 
     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
-    pub remote_client: Option<Arc<RemoteTimelineClient>>,
+    pub remote_client: Arc<RemoteTimelineClient>,
 
     // What page versions do we hold in the repository? If we get a
     // request > last_record_lsn, we need to wait until we receive all
@@ -1375,22 +1375,14 @@ impl Timeline {
     /// not validated with control plane yet.
     /// See [`Self::get_remote_consistent_lsn_visible`].
     pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.remote_consistent_lsn_projected()
-        } else {
-            None
-        }
+        self.remote_client.remote_consistent_lsn_projected()
     }
 
     /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
     /// i.e. a value of remote_consistent_lsn_projected which has undergone
     /// generation validation in the deletion queue.
     pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.remote_consistent_lsn_visible()
-        } else {
-            None
-        }
+        self.remote_client.remote_consistent_lsn_visible()
     }
 
     /// The sum of the file size of all historic layers in the layer map.
@@ -1760,16 +1752,14 @@ impl Timeline {
             match self.freeze_and_flush().await {
                 Ok(_) => {
                     // drain the upload queue
-                    if let Some(client) = self.remote_client.as_ref() {
-                        // if we did not wait for completion here, it might be our shutdown process
-                        // didn't wait for remote uploads to complete at all, as new tasks can forever
-                        // be spawned.
-                        //
-                        // what is problematic is the shutting down of RemoteTimelineClient, because
-                        // obviously it does not make sense to stop while we wait for it, but what
-                        // about corner cases like s3 suddenly hanging up?
-                        client.shutdown().await;
-                    }
+                    // if we did not wait for completion here, it might be our shutdown process
+                    // didn't wait for remote uploads to complete at all, as new tasks can forever
+                    // be spawned.
+                    //
+                    // what is problematic is the shutting down of RemoteTimelineClient, because
+                    // obviously it does not make sense to stop while we wait for it, but what
+                    // about corner cases like s3 suddenly hanging up?
+                    self.remote_client.shutdown().await;
                 }
                 Err(e) => {
                     // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
@@ -1785,18 +1775,16 @@ impl Timeline {
 
         // Transition the remote_client into a state where it's only useful for timeline deletion.
         // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.stop();
-            // As documented in remote_client.stop()'s doc comment, it's our responsibility
-            // to shut down the upload queue tasks.
-            // TODO: fix that, task management should be encapsulated inside remote_client.
-            task_mgr::shutdown_tasks(
-                Some(TaskKind::RemoteUploadTask),
-                Some(self.tenant_shard_id),
-                Some(self.timeline_id),
-            )
-            .await;
-        }
+        self.remote_client.stop();
+        // As documented in remote_client.stop()'s doc comment, it's our responsibility
+        // to shut down the upload queue tasks.
+        // TODO: fix that, task management should be encapsulated inside remote_client.
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::RemoteUploadTask),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
+        )
+        .await;
 
         // TODO: work toward making this a no-op. See this funciton's doc comment for more context.
         tracing::debug!("Waiting for tasks...");
@@ -1922,10 +1910,6 @@ impl Timeline {
             return Ok(None);
         };
 
-        if self.remote_client.is_none() {
-            return Ok(Some(false));
-        }
-
         layer.download().await?;
 
         Ok(Some(true))
@@ -2190,7 +2174,7 @@ impl Timeline {
                 walredo_mgr,
                 walreceiver: Mutex::new(None),
 
-                remote_client: resources.remote_client.map(Arc::new),
+                remote_client: Arc::new(resources.remote_client),
 
                 // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
                 last_record_lsn: SeqWait::new(RecordLsn {
@@ -2437,10 +2421,6 @@ impl Timeline {
                             discovered_layers.push((layer_file_name, local_path, file_size));
                             continue;
                         }
-                        Discovered::Metadata => {
-                            warn!("found legacy metadata file, these should have been removed in load_tenant_config");
-                            continue;
-                        }
                         Discovered::IgnoredBackup => {
                             continue;
                         }
@@ -2487,12 +2467,10 @@ impl Timeline {
                             if local.metadata.file_size() == remote.file_size() {
                                 // Use the local file, but take the remote metadata so that we pick up
                                 // the correct generation.
-                                UseLocal(
-                                    LocalLayerFileMetadata {
-                                        metadata: remote,
-                                        local_path: local.local_path
-                                    }
-                                )
+                                UseLocal(LocalLayerFileMetadata {
+                                    metadata: remote,
+                                    local_path: local.local_path,
+                                })
                             } else {
                                 init::cleanup_local_file_for_remote(&local, &remote)?;
                                 UseRemote { local, remote }
@@ -2501,7 +2479,11 @@ impl Timeline {
                         Ok(decision) => decision,
                         Err(DismissedLayer::Future { local }) => {
                             if let Some(local) = local {
-                                init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?;
+                                init::cleanup_future_layer(
+                                    &local.local_path,
+                                    &name,
+                                    disk_consistent_lsn,
+                                )?;
                             }
                             needs_cleanup.push(name);
                             continue;
@@ -2523,7 +2505,8 @@ impl Timeline {
                     let layer = match decision {
                         UseLocal(local) => {
                             total_physical_size += local.metadata.file_size();
-                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard()
+                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
+                                .drop_eviction_guard()
                         }
                         Evicted(remote) | UseRemote { remote, .. } => {
                             Layer::for_evicted(conf, &this, name, remote)
@@ -2543,36 +2526,36 @@ impl Timeline {
 
         guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
 
-        if let Some(rtc) = self.remote_client.as_ref() {
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
-            rtc.schedule_index_upload_for_file_changes()?;
-            // This barrier orders above DELETEs before any later operations.
-            // This is critical because code executing after the barrier might
-            // create again objects with the same key that we just scheduled for deletion.
-            // For example, if we just scheduled deletion of an image layer "from the future",
-            // later compaction might run again and re-create the same image layer.
-            // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
-            // "same" here means same key range and LSN.
-            //
-            // Without a barrier between above DELETEs and the re-creation's PUTs,
-            // the upload queue may execute the PUT first, then the DELETE.
-            // In our example, we will end up with an IndexPart referencing a non-existent object.
-            //
-            // 1. a future image layer is created and uploaded
-            // 2. ps restart
-            // 3. the future layer from (1) is deleted during load layer map
-            // 4. image layer is re-created and uploaded
-            // 5. deletion queue would like to delete (1) but actually deletes (4)
-            // 6. delete by name works as expected, but it now deletes the wrong (later) version
-            //
-            // See https://github.com/neondatabase/neon/issues/5878
-            //
-            // NB: generation numbers naturally protect against this because they disambiguate
-            //     (1) and (4)
-            rtc.schedule_barrier()?;
-            // Tenant::create_timeline will wait for these uploads to happen before returning, or
-            // on retry.
-        }
+        self.remote_client
+            .schedule_layer_file_deletion(&needs_cleanup)?;
+        self.remote_client
+            .schedule_index_upload_for_file_changes()?;
+        // This barrier orders above DELETEs before any later operations.
+        // This is critical because code executing after the barrier might
+        // create again objects with the same key that we just scheduled for deletion.
+        // For example, if we just scheduled deletion of an image layer "from the future",
+        // later compaction might run again and re-create the same image layer.
+        // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
+        // "same" here means same key range and LSN.
+        //
+        // Without a barrier between above DELETEs and the re-creation's PUTs,
+        // the upload queue may execute the PUT first, then the DELETE.
+        // In our example, we will end up with an IndexPart referencing a non-existent object.
+        //
+        // 1. a future image layer is created and uploaded
+        // 2. ps restart
+        // 3. the future layer from (1) is deleted during load layer map
+        // 4. image layer is re-created and uploaded
+        // 5. deletion queue would like to delete (1) but actually deletes (4)
+        // 6. delete by name works as expected, but it now deletes the wrong (later) version
+        //
+        // See https://github.com/neondatabase/neon/issues/5878
+        //
+        // NB: generation numbers naturally protect against this because they disambiguate
+        //     (1) and (4)
+        self.remote_client.schedule_barrier()?;
+        // Tenant::create_timeline will wait for these uploads to happen before returning, or
+        // on retry.
 
         info!(
             "loaded layer map with {} layers at {}, total physical size: {}",
@@ -3025,9 +3008,6 @@ impl Timeline {
     /// should treat this as a cue to simply skip doing any heatmap uploading
     /// for this timeline.
     pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
-        // no point in heatmaps without remote client
-        let _remote_client = self.remote_client.as_ref()?;
-
         if !self.is_active() {
             return None;
         }
@@ -3055,10 +3035,7 @@ impl Timeline {
         // branchpoint in the value in IndexPart::lineage
         self.ancestor_lsn == lsn
             || (self.ancestor_lsn == Lsn::INVALID
-                && self
-                    .remote_client
-                    .as_ref()
-                    .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn)))
+                && self.remote_client.is_previous_ancestor_lsn(lsn))
     }
 }
 
@@ -3978,29 +3955,23 @@ impl Timeline {
             x.unwrap()
         ));
 
-        if let Some(remote_client) = &self.remote_client {
-            for layer in layers_to_upload {
-                remote_client.schedule_layer_file_upload(layer)?;
-            }
-            remote_client.schedule_index_upload_for_metadata_update(&update)?;
+        for layer in layers_to_upload {
+            self.remote_client.schedule_layer_file_upload(layer)?;
         }
+        self.remote_client
+            .schedule_index_upload_for_metadata_update(&update)?;
 
         Ok(())
     }
 
     pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client
-                .preserve_initdb_archive(
-                    &self.tenant_shard_id.tenant_id,
-                    &self.timeline_id,
-                    &self.cancel,
-                )
-                .await?;
-        } else {
-            bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
-        }
-        Ok(())
+        self.remote_client
+            .preserve_initdb_archive(
+                &self.tenant_shard_id.tenant_id,
+                &self.timeline_id,
+                &self.cancel,
+            )
+            .await
     }
 
     // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
@@ -4361,12 +4332,7 @@ impl Timeline {
             return;
         }
 
-        if self
-            .remote_client
-            .as_ref()
-            .map(|c| c.is_deleting())
-            .unwrap_or(false)
-        {
+        if self.remote_client.is_deleting() {
             // The timeline was created in a deletion-resume state, we don't expect logical size to be populated
             return;
         }
@@ -4534,9 +4500,8 @@ impl Timeline {
         // deletion will happen later, the layer file manager calls garbage_collect_on_drop
         guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
 
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&remove_layers, new_deltas)?;
-        }
+        self.remote_client
+            .schedule_compaction_update(&remove_layers, new_deltas)?;
 
         drop_wlock(guard);
 
@@ -4554,9 +4519,8 @@ impl Timeline {
 
         let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
 
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
-        }
+        self.remote_client
+            .schedule_compaction_update(&drop_layers, &upload_layers)?;
 
         Ok(())
     }
@@ -4566,16 +4530,14 @@ impl Timeline {
         self: &Arc<Self>,
         new_images: impl IntoIterator<Item = ResidentLayer>,
     ) -> anyhow::Result<()> {
-        let Some(remote_client) = &self.remote_client else {
-            return Ok(());
-        };
         for layer in new_images {
-            remote_client.schedule_layer_file_upload(layer)?;
+            self.remote_client.schedule_layer_file_upload(layer)?;
         }
         // should any new image layer been created, not uploading index_part will
         // result in a mismatch between remote_physical_size and layermap calculated
         // size, which will fail some tests, but should not be an issue otherwise.
-        remote_client.schedule_index_upload_for_file_changes()?;
+        self.remote_client
+            .schedule_index_upload_for_file_changes()?;
         Ok(())
     }
 
@@ -4861,9 +4823,7 @@ impl Timeline {
 
             result.layers_removed = gc_layers.len() as u64;
 
-            if let Some(remote_client) = self.remote_client.as_ref() {
-                remote_client.schedule_gc_update(&gc_layers)?;
-            }
+            self.remote_client.schedule_gc_update(&gc_layers)?;
 
             guard.finish_gc_timeline(&gc_layers);
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 4226bf431e..ed48b4c9cb 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -295,13 +295,11 @@ impl Timeline {
         // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
         self.rewrite_layers(replace_layers, drop_layers).await?;
 
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            // We wait for all uploads to complete before finishing this compaction stage.  This is not
-            // necessary for correctness, but it simplifies testing, and avoids proceeding with another
-            // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
-            // load.
-            remote_client.wait_completion().await?;
-        }
+        // We wait for all uploads to complete before finishing this compaction stage.  This is not
+        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
+        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
+        // load.
+        self.remote_client.wait_completion().await?;
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index d8701be170..901f5149b3 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -26,19 +26,21 @@ use super::{Timeline, TimelineResources};
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
 async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        match remote_client.persist_index_part_with_deleted_flag().await {
-            // If we (now, or already) marked it successfully as deleted, we can proceed
-            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
-            // Bail out otherwise
-            //
-            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
-            // two tasks from performing the deletion at the same time. The first task
-            // that starts deletion should run it to completion.
-            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
-            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
-                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
-            }
+    match timeline
+        .remote_client
+        .persist_index_part_with_deleted_flag()
+        .await
+    {
+        // If we (now, or already) marked it successfully as deleted, we can proceed
+        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+        // Bail out otherwise
+        //
+        // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+        // two tasks from performing the deletion at the same time. The first task
+        // that starts deletion should run it to completion.
+        Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+        | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+            return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
         }
     }
     Ok(())
@@ -117,11 +119,11 @@ pub(super) async fn delete_local_timeline_directory(
 
 /// Removes remote layers and an index file after them.
 async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
-    if let Some(remote_client) = &timeline.remote_client {
-        remote_client.delete_all().await.context("delete_all")?
-    };
-
-    Ok(())
+    timeline
+        .remote_client
+        .delete_all()
+        .await
+        .context("delete_all")
 }
 
 // This function removs remaining traces of a timeline on disk.
@@ -260,7 +262,7 @@ impl DeleteTimelineFlow {
         tenant: Arc<Tenant>,
         timeline_id: TimelineId,
         local_metadata: &TimelineMetadata,
-        remote_client: Option<RemoteTimelineClient>,
+        remote_client: RemoteTimelineClient,
         deletion_queue_client: DeletionQueueClient,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 9471ba860f..7f59758c87 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -70,10 +70,6 @@ pub(super) async fn prepare(
 ) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
     use Error::*;
 
-    if detached.remote_client.as_ref().is_none() {
-        unimplemented!("no new code for running without remote storage");
-    }
-
     let Some((ancestor, ancestor_lsn)) = detached
         .ancestor_timeline
         .as_ref()
@@ -315,8 +311,6 @@ async fn upload_rewritten_layer(
     // FIXME: better shuttingdown error
     target
         .remote_client
-        .as_ref()
-        .unwrap()
         .upload_layer_file(&copied, cancel)
         .await
         .map_err(UploadRewritten)?;
@@ -406,8 +400,6 @@ async fn remote_copy(
     // FIXME: better shuttingdown error
     adoptee
         .remote_client
-        .as_ref()
-        .unwrap()
         .copy_timeline_layer(adopted, &owned, cancel)
         .await
         .map(move |()| owned)
@@ -421,11 +413,6 @@ pub(super) async fn complete(
     prepared: PreparedTimelineDetach,
     _ctx: &RequestContext,
 ) -> Result<Vec<TimelineId>, anyhow::Error> {
-    let rtc = detached
-        .remote_client
-        .as_ref()
-        .expect("has to have a remote timeline client for timeline ancestor detach");
-
     let PreparedTimelineDetach { layers } = prepared;
 
     let ancestor = detached
@@ -442,11 +429,13 @@ pub(super) async fn complete(
     //
     // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
     // which could give us a completely wrong layer combination.
-    rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
-        &layers,
-        (ancestor.timeline_id, ancestor_lsn),
-    )
-    .await?;
+    detached
+        .remote_client
+        .schedule_adding_existing_layers_to_index_detach_and_wait(
+            &layers,
+            (ancestor.timeline_id, ancestor_lsn),
+        )
+        .await?;
 
     let mut tasks = tokio::task::JoinSet::new();
 
@@ -491,8 +480,6 @@ pub(super) async fn complete(
                 async move {
                     let res = timeline
                         .remote_client
-                        .as_ref()
-                        .expect("reparented has to have remote client because detached has one")
                         .schedule_reparenting_and_wait(&new_parent)
                         .await;
 
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 3567761b9a..8a8c38d0ce 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -23,7 +23,7 @@ use std::{
 use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
+use tracing::{debug, info, info_span, instrument, warn, Instrument};
 
 use crate::{
     context::{DownloadBehavior, RequestContext},
@@ -211,11 +211,6 @@ impl Timeline {
 
         // So, we just need to deal with this.
 
-        if self.remote_client.is_none() {
-            error!("no remote storage configured, cannot evict layers");
-            return ControlFlow::Continue(());
-        }
-
         let mut js = tokio::task::JoinSet::new();
         {
             let guard = self.layers.read().await;
diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs
index 66aa765015..feadc79e5e 100644
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -9,7 +9,6 @@ use crate::{
         storage_layer::LayerName,
         Generation,
     },
-    METADATA_FILE_NAME,
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
@@ -27,8 +26,6 @@ pub(super) enum Discovered {
     Temporary(String),
     /// Temporary on-demand download files, should be removed
     TemporaryDownload(String),
-    /// "metadata" file we persist locally and include in `index_part.json`
-    Metadata,
     /// Backup file from previously future layers
     IgnoredBackup,
     /// Unrecognized, warn about these
@@ -49,9 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
                 Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
             }
             Err(_) => {
-                if file_name == METADATA_FILE_NAME {
-                    Discovered::Metadata
-                } else if file_name.ends_with(".old") {
+                if file_name.ends_with(".old") {
                     // ignore these
                     Discovered::IgnoredBackup
                 } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 7d4e101189..61afd820ca 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -56,14 +56,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     (tenant0, timeline0, pg0) = tenant_timelines[0]
     log.info(f"Timeline {tenant0}/{timeline0} is left intact")
 
-    (tenant1, timeline1, pg1) = tenant_timelines[1]
-    metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
-    with open(metadata_path, "w") as f:
-        f.write("overwritten with garbage!")
-    log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
-
-    (tenant2, timeline2, pg2) = tenant_timelines[2]
-    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
+    (tenant1, timeline1, pg1) = tenant_timelines[2]
+    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/"
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
             # Looks like a layer file. Corrupt it
@@ -72,7 +66,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
             with open(p, "wb") as f:
                 f.truncate(0)
                 f.truncate(size)
-    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
+    log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled")
 
     env.pageserver.start()
 
@@ -80,19 +74,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     pg0.start()
     assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
 
-    # Tenant with corrupt local metadata works: remote storage is authoritative for metadata
-    pg1.start()
-    assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
-
     # Second timeline will fail during basebackup, because the local layer file is corrupt.
     # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
     # (We don't check layer file contents on startup, when loading the timeline)
     #
     # This will change when we implement checksums for layers
     with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err:
-        pg2.start()
+        pg1.start()
     log.info(
-        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
+        f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}"
     )
 
 

From bc78b0e9cc95ea033797b13d5bb36e61d338a070 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 15 May 2024 14:18:02 +0200
Subject: [PATCH 057/130] chore(deps): use upstream svg_fmt after they merged
 our PR (#7764)

They have merged our PR https://github.com/nical/rust_debug/pull/4 but
they haven't released a new crate version yet.

refs https://github.com/neondatabase/neon/issues/7763
---
 Cargo.lock | 2 +-
 Cargo.toml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6ce7180d67..961101b151 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5952,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 [[package]]
 name = "svg_fmt"
 version = "0.4.2"
-source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
+source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
 
 [[package]]
 name = "syn"
diff --git a/Cargo.toml b/Cargo.toml
index 17f30a1327..3ccdabee18 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -158,8 +158,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# https://github.com/nical/rust_debug/pull/4
-svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
+# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
+svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"

From c3dd646ab3a48e1063c7efc6d080e21fdfb48fa7 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 15 May 2024 15:04:52 +0200
Subject: [PATCH 058/130] chore!: always use async walredo, warn if sync is
 configured (#7754)

refs https://github.com/neondatabase/neon/issues/7753

This PR is step (1) of removing sync walredo from Pageserver.

Changes:
* Remove the sync impl
* If sync is configured, warn! and use async instead
* Remove the metric that exposes `kind`
* Remove the tenant status API that exposes `kind`

Future Work
-----------

After we've released this change to prod and are sure we won't
roll back, we will

1. update the prod Ansible to remove the config flag from the prod
   pageserver.toml.
2. remove the remaining `kind` code in pageserver

These two changes need no release inbetween.

See  https://github.com/neondatabase/neon/issues/7753 for details.
---
 libs/pageserver_api/src/models.rs             |   3 -
 pageserver/benches/bench_walredo.rs           | 139 ++----
 pageserver/src/bin/pageserver.rs              |   1 -
 pageserver/src/config.rs                      |   2 +-
 pageserver/src/metrics.rs                     |  23 -
 pageserver/src/walredo.rs                     |   5 +-
 pageserver/src/walredo/process.rs             |  57 +--
 .../process/process_impl/process_std.rs       | 405 ------------------
 test_runner/regress/test_pageserver_config.py |  35 --
 9 files changed, 67 insertions(+), 603 deletions(-)
 delete mode 100644 pageserver/src/walredo/process/process_impl/process_std.rs
 delete mode 100644 test_runner/regress/test_pageserver_config.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 1df5820fb9..d78d2bcbea 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -776,9 +776,6 @@ pub struct TimelineGcRequest {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerProcessStatus {
     pub pid: u32,
-    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
-    /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
-    pub kind: Cow<'static, str>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 5b871c5d5e..5aab10e5d9 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -30,47 +30,27 @@
 //! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! async-short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
-//! async-short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
-//! async-short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
-//! async-short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
-//! async-short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
-//! async-short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
-//! async-short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
-//! async-short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
-//! async-medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
-//! async-medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
-//! async-medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
-//! async-medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
-//! async-medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
-//! async-medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
-//! async-medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
-//! async-medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
-//! sync-short/1            time:   [25.503 µs 25.626 µs 25.771 µs]
-//! sync-short/2            time:   [30.850 µs 31.013 µs 31.208 µs]
-//! sync-short/4            time:   [45.543 µs 45.856 µs 46.193 µs]
-//! sync-short/8            time:   [84.114 µs 84.639 µs 85.220 µs]
-//! sync-short/16           time:   [185.22 µs 186.15 µs 187.13 µs]
-//! sync-short/32           time:   [377.43 µs 378.87 µs 380.46 µs]
-//! sync-short/64           time:   [756.49 µs 759.04 µs 761.70 µs]
-//! sync-short/128          time:   [1.4825 ms 1.4874 ms 1.4923 ms]
-//! sync-medium/1           time:   [105.66 µs 106.01 µs 106.43 µs]
-//! sync-medium/2           time:   [153.10 µs 153.84 µs 154.72 µs]
-//! sync-medium/4           time:   [327.13 µs 329.44 µs 332.27 µs]
-//! sync-medium/8           time:   [654.26 µs 658.73 µs 663.63 µs]
-//! sync-medium/16          time:   [1.2682 ms 1.2748 ms 1.2816 ms]
-//! sync-medium/32          time:   [2.4456 ms 2.4595 ms 2.4731 ms]
-//! sync-medium/64          time:   [4.6523 ms 4.6890 ms 4.7256 ms]
-//! sync-medium/128         time:   [8.7215 ms 8.8323 ms 8.9344 ms]
+//! short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
+//! short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
+//! short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
+//! short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
+//! short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
+//! short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
+//! short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
+//! short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
+//! medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
+//! medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
+//! medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
+//! medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
+//! medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
+//! medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
+//! medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
+//! medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
 //! ```
 
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{
-    config::PageServerConf,
-    walrecord::NeonWalRecord,
-    walredo::{PostgresRedoManager, ProcessKind},
-};
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
     sync::Arc,
@@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
 fn bench(c: &mut Criterion) {
-    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
-        {
-            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-            for nclients in nclients {
-                let mut group = c.benchmark_group(format!("{process_kind}-short"));
-                group.bench_with_input(
-                    BenchmarkId::from_parameter(nclients),
-                    &nclients,
-                    |b, nclients| {
-                        let redo_work = Arc::new(Request::short_input());
-                        b.iter_custom(|iters| {
-                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
-                        });
-                    },
-                );
-            }
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("short");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::short_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
         }
-
-        {
-            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-            for nclients in nclients {
-                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
-                group.bench_with_input(
-                    BenchmarkId::from_parameter(nclients),
-                    &nclients,
-                    |b, nclients| {
-                        let redo_work = Arc::new(Request::medium_input());
-                        b.iter_custom(|iters| {
-                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
-                        });
-                    },
-                );
-            }
+    }
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("medium");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::medium_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
         }
     }
 }
@@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(
-    process_kind: ProcessKind,
-    redo_work: Arc<Request>,
-    n_redos: u64,
-    nclients: u64,
-) -> Duration {
+fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
 
-    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
-    conf.walredo_process_kind = process_kind;
+    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
     let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
@@ -158,27 +125,13 @@ fn bench_impl(
         });
     }
 
-    let elapsed = rt.block_on(async move {
+    rt.block_on(async move {
         let mut total_wallclock_time = Duration::ZERO;
         while let Some(res) = tasks.join_next().await {
             total_wallclock_time += res.unwrap();
         }
         total_wallclock_time
-    });
-
-    // consistency check to ensure process kind setting worked
-    if nredos_per_client > 0 {
-        assert_eq!(
-            manager
-                .status()
-                .process
-                .map(|p| p.kind)
-                .expect("the benchmark work causes a walredo process to be spawned"),
-            std::borrow::Cow::Borrowed(process_kind.into())
-        );
-    }
-
-    elapsed
+    })
 }
 
 async fn client(
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index c0099aa704..a04195e12b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -284,7 +284,6 @@ fn start_pageserver(
     ))
     .unwrap();
     pageserver::preinitialize_metrics();
-    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 258eed0b12..b0afb6414b 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -99,7 +99,7 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
-    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
 
     ///
     /// Default built-in configuration file.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index b27bfb43b0..ffcd08b4b3 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1999,29 +1999,6 @@ impl Default for WalRedoProcessCounters {
 pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
     Lazy::new(WalRedoProcessCounters::default);
 
-#[cfg(not(test))]
-pub mod wal_redo {
-    use super::*;
-
-    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
-        std::sync::Mutex::new(
-            register_uint_gauge_vec!(
-                "pageserver_wal_redo_process_kind",
-                "The configured process kind for walredo",
-                &["kind"],
-            )
-            .unwrap(),
-        )
-    });
-
-    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
-        // use guard to avoid races around the next two steps
-        let guard = PROCESS_KIND.lock().unwrap();
-        guard.reset();
-        guard.with_label_values(&[&format!("{kind}")]).set(1);
-    }
-}
-
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub(crate) struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 9776d4ce88..3decea0c6d 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -153,10 +153,7 @@ impl PostgresRedoManager {
             process: self
                 .redo_process
                 .get()
-                .map(|p| WalRedoManagerProcessStatus {
-                    pid: p.id(),
-                    kind: std::borrow::Cow::Borrowed(p.kind().into()),
-                }),
+                .map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
         }
     }
 }
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs
index ad6b4e5fe9..02c9c04bf1 100644
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,7 +1,10 @@
+/// Layer of indirection previously used to support multiple implementations.
+/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
 use std::time::Duration;
 
 use bytes::Bytes;
 use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use tracing::warn;
 use utils::lsn::Lsn;
 
 use crate::{config::PageServerConf, walrecord::NeonWalRecord};
@@ -12,7 +15,6 @@ mod protocol;
 
 mod process_impl {
     pub(super) mod process_async;
-    pub(super) mod process_std;
 }
 
 #[derive(
@@ -34,10 +36,7 @@ pub enum Kind {
     Async,
 }
 
-pub(crate) enum Process {
-    Sync(process_impl::process_std::WalRedoProcess),
-    Async(process_impl::process_async::WalRedoProcess),
-}
+pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
 
 impl Process {
     #[inline(always)]
@@ -46,18 +45,17 @@ impl Process {
         tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
-        Ok(match conf.walredo_process_kind {
-            Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
-                conf,
-                tenant_shard_id,
-                pg_version,
-            )?),
-            Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
-                conf,
-                tenant_shard_id,
-                pg_version,
-            )?),
-        })
+        if conf.walredo_process_kind != Kind::Async {
+            warn!(
+                configured = %conf.walredo_process_kind,
+                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
+            );
+        }
+        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
+            conf,
+            tenant_shard_id,
+            pg_version,
+        )?))
     }
 
     #[inline(always)]
@@ -69,29 +67,12 @@ impl Process {
         records: &[(Lsn, NeonWalRecord)],
         wal_redo_timeout: Duration,
     ) -> anyhow::Result<Bytes> {
-        match self {
-            Process::Sync(p) => {
-                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-                    .await
-            }
-            Process::Async(p) => {
-                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-                    .await
-            }
-        }
+        self.0
+            .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+            .await
     }
 
     pub(crate) fn id(&self) -> u32 {
-        match self {
-            Process::Sync(p) => p.id(),
-            Process::Async(p) => p.id(),
-        }
-    }
-
-    pub(crate) fn kind(&self) -> Kind {
-        match self {
-            Process::Sync(_) => Kind::Sync,
-            Process::Async(_) => Kind::Async,
-        }
+        self.0.id()
     }
 }
diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs
deleted file mode 100644
index e7a6c263c9..0000000000
--- a/pageserver/src/walredo/process/process_impl/process_std.rs
+++ /dev/null
@@ -1,405 +0,0 @@
-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-    walredo::process::{no_leak_child, protocol},
-};
-use anyhow::Context;
-use bytes::Bytes;
-use nix::poll::{PollFd, PollFlags};
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-use std::os::fd::AsRawFd;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    io::{Read, Write},
-    process::{ChildStdin, ChildStdout, Command, Stdio},
-    sync::{Mutex, MutexGuard},
-    time::Duration,
-};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, nonblock::set_nonblock};
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: Mutex<ProcessOutput>,
-    stdin: Mutex<ProcessInput>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
-}
-
-struct ProcessInput {
-    stdin: ChildStdin,
-    n_requests: usize,
-}
-
-struct ProcessOutput {
-    stdout: ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
-
-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-        pg_version: u32,
-    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        macro_rules! set_nonblock_or_log_err {
-        ($file:ident) => {{
-            let res = set_nonblock($file.as_raw_fd());
-            if let Err(e) = &res {
-                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
-            }
-            res
-        }};
-    }
-        set_nonblock_or_log_err!(stdin)?;
-        set_nonblock_or_log_err!(stdout)?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-            async move {
-                scopeguard::defer! {
-                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-                }
-                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
-                        }
-                        Err(e) => {
-                            break Err(e);
-                        }
-                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
-                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-        );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: Mutex::new(ProcessInput {
-                stdin,
-                n_requests: 0,
-            }),
-            stdout: Mutex::new(ProcessOutput {
-                stdout,
-                pending_responses: VecDeque::new(),
-                n_processed_responses: 0,
-            }),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
-    }
-
-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    // Apply given WAL records ('records') over an old page image. Returns
-    // new page image.
-    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    pub(crate) async fn apply_wal_records(
-        &self,
-        rel: RelTag,
-        blknum: u32,
-        base_img: &Option<Bytes>,
-        records: &[(Lsn, NeonWalRecord)],
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-        let input = self.stdin.lock().unwrap();
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        input: MutexGuard<ProcessInput>,
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
-        let mut nwrite = 0usize;
-
-        while nwrite < writebuf.len() {
-            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
-            let n = loop {
-                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
-                    res => break res,
-                }
-            }?;
-
-            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
-            }
-
-            // If 'stdin' is writeable, do write.
-            let in_revents = stdin_pollfds[0].revents().unwrap();
-            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
-                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
-            }
-        }
-        let request_no = proc.n_requests;
-        proc.n_requests += 1;
-        drop(proc);
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut output = self.stdout.lock().unwrap();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
-                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
-                // We do two things simultaneously: reading response from stdout
-                // and forward any logging information that the child writes to its stderr to the page server's log.
-                let n = loop {
-                    match nix::poll::poll(
-                        &mut stdout_pollfds[..],
-                        wal_redo_timeout.as_millis() as i32,
-                    ) {
-                        Err(nix::errno::Errno::EINTR) => continue,
-                        res => break res,
-                    }
-                }?;
-
-                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
-                }
-
-                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = stdout_pollfds[0].revents().unwrap();
-                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
-                }
-            }
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
-    }
-}
diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py
deleted file mode 100644
index c04348b488..0000000000
--- a/test_runner/regress/test_pageserver_config.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import pytest
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-    last_flush_lsn_upload,
-)
-
-
-@pytest.mark.parametrize("kind", ["sync", "async"])
-def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str):
-    neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'"
-    # ensure it starts
-    env = neon_env_builder.init_start()
-    # ensure the metric is set
-    ps_http = env.pageserver.http_client()
-    metrics = ps_http.get_metrics()
-    samples = metrics.query_all("pageserver_wal_redo_process_kind")
-    assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)]
-    # ensure default tenant's config kind matches
-    # => write some data to force-spawn walredo
-    ep = env.endpoints.create_start("main")
-    with ep.connect() as conn:
-        with conn.cursor() as cur:
-            cur.execute("create table foo(bar text)")
-            cur.execute("insert into foo select from generate_series(1, 100)")
-    last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline)
-    ep.stop()
-    ep.start()
-    with ep.connect() as conn:
-        with conn.cursor() as cur:
-            cur.execute("select count(*) from foo")
-            [(count,)] = cur.fetchall()
-            assert count == 100
-
-    status = ps_http.tenant_status(env.initial_tenant)
-    assert status["walredo"]["process"]["kind"] == kind

From 1075386d778f67ad32200c3e7d7279479a7eb84f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 15 May 2024 15:32:47 +0200
Subject: [PATCH 059/130] Add test_uploads_and_deletions test (#7758)

Adds a test that is a reproducer for many tiered compaction bugs,
both ones that have since been fixed as well as still unfxied ones:
* (now fixed) #7296
* #7707
* #7759
* Likely also #7244 but I haven't tried that.

The key ordering bug can be reproduced by switching to
`merge_delta_keys` instead of `merge_delta_keys_buffered`, so reverting
a big part of #7661, although it only sometimes reproduces (30-50% of
cases).

part of https://github.com/neondatabase/neon/issues/7554
---
 test_runner/fixtures/neon_fixtures.py         | 78 +++++++++++++++++
 test_runner/regress/test_compaction.py        | 62 +++++++++++++-
 .../regress/test_pageserver_generations.py    | 83 +------------------
 3 files changed, 141 insertions(+), 82 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 62a4b974a3..405ef19bfc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -59,6 +59,7 @@ from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_for_upload_queue_empty,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
@@ -79,6 +80,7 @@ from fixtures.utils import (
     allure_attach_from_dir,
     assert_no_errors,
     get_self_dir,
+    print_gc_result,
     subprocess_capture,
     wait_until,
 )
@@ -4419,3 +4421,79 @@ def parse_project_git_version_output(s: str) -> str:
         return commit
 
     raise ValueError(f"unable to parse --version output: '{s}'")
+
+
+def generate_uploads_and_deletions(
+    env: NeonEnv,
+    *,
+    init: bool = True,
+    tenant_id: Optional[TenantId] = None,
+    timeline_id: Optional[TimelineId] = None,
+    data: Optional[str] = None,
+    pageserver: NeonPageserver,
+):
+    """
+    Using the environment's default tenant + timeline, generate a load pattern
+    that results in some uploads and some deletions to remote storage.
+    """
+
+    if tenant_id is None:
+        tenant_id = env.initial_tenant
+    assert tenant_id is not None
+
+    if timeline_id is None:
+        timeline_id = env.initial_timeline
+    assert timeline_id is not None
+
+    ps_http = pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
+    ) as endpoint:
+        if init:
+            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
+
+        def churn(data):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                INSERT INTO foo (id, val)
+                SELECT g, '{data}'
+                FROM generate_series(1, 200) g
+                ON CONFLICT (id) DO UPDATE
+                SET val = EXCLUDED.val
+                """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
+            assert tenant_id is not None
+            assert timeline_id is not None
+            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
+            # in a state where there are "future layers" in remote storage that will generate deletions
+            # after a restart.
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
+
+        # Compaction should generate some GC-elegible layers
+        for i in range(0, 2):
+            churn(f"{i if data is None else data}")
+
+        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
+        # ensures that the pageserver is in a fully idle state: there will be no more
+        # background ingest, no more uploads pending, and therefore no non-determinism
+        # in subsequent actions like pageserver restarts.
+        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        # Finish uploads
+        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
+        # Finish all remote writes (including deletions)
+        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 43a3323462..93a16620a3 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -1,10 +1,12 @@
+import enum
 import json
 import os
 from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.workload import Workload
 
 AGGRESIVE_COMPACTION_TENANT_CONF = {
@@ -190,3 +192,61 @@ def test_sharding_compaction(
 
     # Assert that everything is still readable
     workload.validate()
+
+
+class CompactionAlgorithm(str, enum.Enum):
+    LEGACY = "Legacy"
+    TIERED = "Tiered"
+
+
+@pytest.mark.parametrize(
+    "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED]
+)
+def test_uploads_and_deletions(
+    neon_env_builder: NeonEnvBuilder,
+    compaction_algorithm: CompactionAlgorithm,
+):
+    """
+    :param compaction_algorithm: the compaction algorithm to use.
+    """
+
+    tenant_conf = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{128 * 1024}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # create image layers eagerly, so that GC can remove some layers
+        "image_creation_threshold": "1",
+        "image_layer_creation_check_threshold": "0",
+        "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}),
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    # TODO remove these allowed errors
+    # https://github.com/neondatabase/neon/issues/7707
+    # https://github.com/neondatabase/neon/issues/7759
+    allowed_errors = [
+        ".*duplicated L1 layer.*",
+        ".*delta layer created with.*duplicate values.*",
+        ".*assertion failed: self.lsn_range.start <= lsn.*",
+        ".*HTTP request handler task panicked: task.*panicked.*",
+    ]
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        env.pageserver.allowed_errors.extend(allowed_errors)
+
+    try:
+        generate_uploads_and_deletions(env, pageserver=env.pageserver)
+    except PageserverApiException as e:
+        log.info(f"Obtained PageserverApiException: {e}")
+
+    # The errors occur flakily and no error is ensured to occur,
+    # however at least one of them occurs.
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
+        if not found_allowed_error:
+            raise Exception("None of the allowed_errors occured in the log")
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 4fdc5852f5..9b97254410 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -21,11 +21,9 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
-    NeonPageserver,
     PgBin,
     S3Scrubber,
-    flush_ep_to_pageserver,
-    last_flush_lsn_upload,
+    generate_uploads_and_deletions,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
@@ -33,12 +31,11 @@ from fixtures.pageserver.utils import (
     list_prefix,
     wait_for_last_record_lsn,
     wait_for_upload,
-    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import (
     RemoteStorageKind,
 )
-from fixtures.utils import print_gc_result, wait_until
+from fixtures.utils import wait_until
 from fixtures.workload import Workload
 
 # A tenant configuration that is convenient for generating uploads and deletions
@@ -59,82 +56,6 @@ TENANT_CONF = {
 }
 
 
-def generate_uploads_and_deletions(
-    env: NeonEnv,
-    *,
-    init: bool = True,
-    tenant_id: Optional[TenantId] = None,
-    timeline_id: Optional[TimelineId] = None,
-    data: Optional[str] = None,
-    pageserver: NeonPageserver,
-):
-    """
-    Using the environment's default tenant + timeline, generate a load pattern
-    that results in some uploads and some deletions to remote storage.
-    """
-
-    if tenant_id is None:
-        tenant_id = env.initial_tenant
-    assert tenant_id is not None
-
-    if timeline_id is None:
-        timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    ps_http = pageserver.http_client()
-
-    with env.endpoints.create_start(
-        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
-    ) as endpoint:
-        if init:
-            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
-            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
-            )
-
-        def churn(data):
-            endpoint.safe_psql_many(
-                [
-                    f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 200) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                    # to ensure that GC can actually remove some layers
-                    "VACUUM foo",
-                ]
-            )
-            assert tenant_id is not None
-            assert timeline_id is not None
-            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
-            # in a state where there are "future layers" in remote storage that will generate deletions
-            # after a restart.
-            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
-            )
-
-        # Compaction should generate some GC-elegible layers
-        for i in range(0, 2):
-            churn(f"{i if data is None else data}")
-
-        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
-        print_gc_result(gc_result)
-        assert gc_result["layers_removed"] > 0
-
-        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
-        # ensures that the pageserver is in a fully idle state: there will be no more
-        # background ingest, no more uploads pending, and therefore no non-determinism
-        # in subsequent actions like pageserver restarts.
-        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id)
-        # Finish uploads
-        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
-        # Finish all remote writes (including deletions)
-        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
-
-
 def read_all(
     env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
 ):

From 3ef6e2121178a8c4e7f498aff9a9e7cb9376fd1c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 15 May 2024 18:17:55 +0200
Subject: [PATCH 060/130] fixup #7747: actually use the fixture for
 neon_env_builder (#7767)

The `= None` makes it not use the fixture.

This slipped due to last-minute changes.
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 405ef19bfc..d3aadbe612 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1392,8 +1392,8 @@ def neon_env_builder(
     test_overlay_dir: Path,
     top_output_dir: Path,
     pageserver_virtual_file_io_engine: str,
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
     pageserver_aux_file_policy: Optional[AuxFileStore] = None,
-    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.

From affc18f912b67a31b05a05a05a9cff468a74d75f Mon Sep 17 00:00:00 2001
From: Jure Bajic <jure.bajic94@gmail.com>
Date: Wed, 15 May 2024 18:41:12 +0200
Subject: [PATCH 061/130] Add performance regress
 `test_ondemand_download_churn.py` (#7242)

Add performance regress test  for on-demand download throughput.

Closes https://github.com/neondatabase/neon/issues/7146

Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 libs/pageserver_api/src/models.rs             |  10 +
 .../src/cmd/ondemand_download_churn.rs        | 103 ++++++++---
 .../pagebench/test_ondemand_download_churn.py | 175 ++++++++++++++++++
 3 files changed, 267 insertions(+), 21 deletions(-)
 create mode 100644 test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d78d2bcbea..7cf54bf32a 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -745,6 +745,16 @@ impl HistoricLayerInfo {
         };
         *field = value;
     }
+    pub fn layer_file_size(&self) -> u64 {
+        match self {
+            HistoricLayerInfo::Delta {
+                layer_file_size, ..
+            } => *layer_file_size,
+            HistoricLayerInfo::Image {
+                layer_file_size, ..
+            } => *layer_file_size,
+        }
+    }
 }
 
 #[derive(Debug, Serialize, Deserialize)]
diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
index 197e782dca..1bb71b9353 100644
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
 
 use pageserver_client::mgmt_api;
 use rand::seq::SliceRandom;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, info};
 use utils::id::{TenantTimelineId, TimelineId};
 
+use std::{f64, sync::Arc};
 use tokio::{
     sync::{mpsc, OwnedSemaphorePermit},
     task::JoinSet,
@@ -12,10 +14,7 @@ use tokio::{
 
 use std::{
     num::NonZeroUsize,
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
+    sync::atomic::{AtomicU64, Ordering},
     time::{Duration, Instant},
 };
 
@@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
     Ok(())
 }
 
+#[derive(serde::Serialize)]
+struct Output {
+    downloads_count: u64,
+    downloads_bytes: u64,
+    evictions_count: u64,
+    timeline_restarts: u64,
+    #[serde(with = "humantime_serde")]
+    runtime: Duration,
+}
+
 #[derive(Debug, Default)]
 struct LiveStats {
-    evictions: AtomicU64,
-    downloads: AtomicU64,
+    evictions_count: AtomicU64,
+    downloads_count: AtomicU64,
+    downloads_bytes: AtomicU64,
     timeline_restarts: AtomicU64,
 }
 
 impl LiveStats {
     fn eviction_done(&self) {
-        self.evictions.fetch_add(1, Ordering::Relaxed);
+        self.evictions_count.fetch_add(1, Ordering::Relaxed);
     }
-    fn download_done(&self) {
-        self.downloads.fetch_add(1, Ordering::Relaxed);
+    fn download_done(&self, size: u64) {
+        self.downloads_count.fetch_add(1, Ordering::Relaxed);
+        self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
     }
     fn timeline_restart_done(&self) {
         self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
     )
     .await?;
 
+    let token = CancellationToken::new();
     let mut tasks = JoinSet::new();
 
-    let live_stats = Arc::new(LiveStats::default());
+    let periodic_stats = Arc::new(LiveStats::default());
+    let total_stats = Arc::new(LiveStats::default());
+
+    let start = Instant::now();
     tasks.spawn({
-        let live_stats = Arc::clone(&live_stats);
+        let periodic_stats = Arc::clone(&periodic_stats);
+        let total_stats = Arc::clone(&total_stats);
+        let cloned_token = token.clone();
         async move {
             let mut last_at = Instant::now();
             loop {
+                if cloned_token.is_cancelled() {
+                    return;
+                }
                 tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
                 let now = Instant::now();
                 let delta: Duration = now - last_at;
                 last_at = now;
 
                 let LiveStats {
-                    evictions,
-                    downloads,
+                    evictions_count,
+                    downloads_count,
+                    downloads_bytes,
                     timeline_restarts,
-                } = &*live_stats;
-                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
-                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                } = &*periodic_stats;
+                let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
+                let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
+                let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
                 let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
-                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
+
+                total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
+                total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
+                total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
+                total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
+
+                let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
+                let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
+                let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
+
+                info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
             }
         }
     });
@@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                 args,
                 Arc::clone(&mgmt_api_client),
                 tl,
-                Arc::clone(&live_stats),
+                Arc::clone(&periodic_stats),
+                token.clone(),
             ));
         }
     }
+    if let Some(runtime) = args.runtime {
+        tokio::spawn(async move {
+            tokio::time::sleep(runtime.into()).await;
+            token.cancel();
+        });
+    }
 
     while let Some(res) = tasks.join_next().await {
         res.unwrap();
     }
+    let end = Instant::now();
+    let duration: Duration = end - start;
+
+    let output = {
+        let LiveStats {
+            evictions_count,
+            downloads_count,
+            downloads_bytes,
+            timeline_restarts,
+        } = &*total_stats;
+        Output {
+            downloads_count: downloads_count.load(Ordering::Relaxed),
+            downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
+            evictions_count: evictions_count.load(Ordering::Relaxed),
+            timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
+            runtime: duration,
+        }
+    };
+    let output = serde_json::to_string_pretty(&output).unwrap();
+    println!("{output}");
+
     Ok(())
 }
 
@@ -140,6 +200,7 @@ async fn timeline_actor(
     mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
     timeline: TenantTimelineId,
     live_stats: Arc<LiveStats>,
+    token: CancellationToken,
 ) {
     // TODO: support sharding
     let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -149,7 +210,7 @@ async fn timeline_actor(
         layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
         concurrency: Arc<tokio::sync::Semaphore>,
     }
-    loop {
+    while !token.is_cancelled() {
         debug!("restarting timeline");
         let layer_map_info = mgmt_api_client
             .layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -185,7 +246,7 @@ async fn timeline_actor(
 
         live_stats.timeline_restart_done();
 
-        loop {
+        while !token.is_cancelled() {
             assert!(!timeline.joinset.is_empty());
             if let Some(res) = timeline.joinset.try_join_next() {
                 debug!(?res, "a layer actor exited, should not happen");
@@ -255,7 +316,7 @@ async fn layer_actor(
                     .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
                     .await
                     .unwrap();
-                live_stats.download_done();
+                live_stats.download_done(layer.layer_file_size());
                 did_it
             }
         };
diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
new file mode 100644
index 0000000000..644c1f559b
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
@@ -0,0 +1,175 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.remote_storage import s3_storage
+from fixtures.utils import humantime_to_ms
+
+
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"])
+@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100])
+@pytest.mark.timeout(1000)
+def test_download_churn(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    io_engine: str,
+    concurrency_per_target: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs)
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            # we don't capture `duration`, but instead use the `runtime` output field from pagebench
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+
+    # Setup env
+    env = setup_env(neon_env_builder, pg_bin)
+    env.pageserver.allowed_errors.append(
+        f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
+    run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
+
+
+def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    # We configure tenant conf such that SQL query below produces a lot of layers.
+    # We don't care what's in the layers really, we just care that layers are created.
+    bytes_per_layer = 10 * (1024**2)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "pitr_interval": "1000d",  # let's not make it get in the way
+            "gc_period": "0s",  # disable periodic gc to avoid noise
+            "compaction_period": "0s",  # disable L0=>L1 compaction
+            "checkpoint_timeout": "10years",  # rely solely on checkpoint_distance
+            "checkpoint_distance": bytes_per_layer,  # 10M instead of 256M to create more smaller layers
+            "image_creation_threshold": 100000,  # don't create image layers ever
+        }
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    client = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
+        ep.safe_psql("CREATE TABLE data (random_text text)")
+        bytes_per_row = 512  # make big enough so WAL record size doesn't dominate
+        desired_layers = 300
+        desired_bytes = bytes_per_layer * desired_layers
+        nrows = desired_bytes / bytes_per_row
+        ep.safe_psql(
+            f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)})  as i",
+            options="-c statement_timeout=0",
+        )
+        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
+    # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
+    wait_for_upload_queue_empty(client, tenant_id, timeline_id)
+
+    return env
+
+
+def run_benchmark(
+    env: NeonEnv,
+    pg_bin: PgBin,
+    record,
+    io_engine: str,
+    concurrency_per_target: int,
+    duration_secs: int,
+):
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "ondemand-download-churn",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--runtime",
+        f"{duration_secs}s",
+        "--set-io-engine",
+        f"{io_engine}",
+        "--concurrency-per-target",
+        f"{concurrency_per_target}",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    metric = "downloads_count"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "downloads_bytes"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="byte",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "evictions_count"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "timeline_restarts"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "runtime"
+    record(
+        metric,
+        metric_value=humantime_to_ms(results[metric]) / 1000,
+        unit="s",
+        report=MetricReport.TEST_PARAM,
+    )

From 4b97683338bc21e13686fe4311946b36462729c1 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 15 May 2024 13:17:57 -0400
Subject: [PATCH 062/130] feat(pageserver): use fnv hash for aux file encoding
 (#7742)

FNV hash is simple, portable, and stable. This pull request vendors the
FNV hash implementation from servo and modified it to use the u128
variant.

replaces https://github.com/neondatabase/neon/pull/7644

ref https://github.com/neondatabase/neon/issues/7462

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/aux_file.rs | 65 ++++++++++++++++++++++++++------------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs
index e6d950487d..38e1875db1 100644
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -5,14 +5,35 @@ use bytes::{Buf, BufMut, Bytes};
 use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
 use tracing::warn;
 
-/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
+// BEGIN Copyright (c) 2017 Servo Contributors
+
+/// Const version of FNV hash.
+#[inline]
+#[must_use]
+pub const fn fnv_hash(bytes: &[u8]) -> u128 {
+    const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
+    const PRIME: u128 = 0x0000000001000000000000000000013B;
+
+    let mut hash = INITIAL_STATE;
+    let mut i = 0;
+    while i < bytes.len() {
+        hash ^= bytes[i] as u128;
+        hash = hash.wrapping_mul(PRIME);
+        i += 1;
+    }
+    hash
+}
+
+// END Copyright (c) 2017 Servo Contributors
+
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
 fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
-    let mut key = [0; METADATA_KEY_SIZE];
-    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
+    let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
+    let hash = fnv_hash(data).to_be_bytes();
     key[0] = AUX_KEY_PREFIX;
     key[1] = dir_level1;
     key[2] = dir_level2;
-    key[3..16].copy_from_slice(&hash[0..13]);
+    key[3..16].copy_from_slice(&hash[3..16]);
     Key::from_metadata_key_fixed_size(&key)
 }
 
@@ -200,15 +221,19 @@ mod tests {
     fn test_hash_portable() {
         // AUX file encoding requires the hash to be portable across all platforms. This test case checks
         // if the algorithm produces the same hash across different environments.
+
         assert_eq!(
-            305317690835051308206966631765527126151,
-            twox_hash::xxh3::hash128("test1".as_bytes())
+            265160408618497461376862998434862070044,
+            super::fnv_hash("test1".as_bytes())
         );
         assert_eq!(
-            85104974691013376326742244813280798847,
-            twox_hash::xxh3::hash128("test/test2".as_bytes())
+            295486155126299629456360817749600553988,
+            super::fnv_hash("test/test2".as_bytes())
+        );
+        assert_eq!(
+            144066263297769815596495629667062367629,
+            super::fnv_hash("".as_bytes())
         );
-        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
     }
 
     #[test]
@@ -216,28 +241,28 @@ mod tests {
         // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
         // of the page server.
         assert_eq!(
-            "6200000101E5B20C5F8DD5AA3289D6D9EAFA",
-            encode_aux_file_key("pg_logical/mappings/test1").to_string()
+            "62000001017F8B83D94F7081693471ABF91C",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string(),
         );
         assert_eq!(
-            "620000010239AAC544893139B26F501B97E6",
-            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
+            "62000001027F8E83D94F7081693471ABFCCD",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
         );
         assert_eq!(
-            "620000010300000000000000000000000000",
-            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
+            "62000001032E07BB014262B821756295C58D",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
         );
         assert_eq!(
-            "62000001FF8635AF2134B7266EC5B4189FD6",
-            encode_aux_file_key("pg_logical/unsupported").to_string()
+            "62000001FF4F38E1C74754E7D03C1A660178",
+            encode_aux_file_key("pg_logical/unsupported").to_string(),
         );
         assert_eq!(
-            "6200000201772D0E5D71DE14DA86142A1619",
+            "62000002017F8D83D94F7081693471ABFB92",
             encode_aux_file_key("pg_replslot/test3").to_string()
         );
         assert_eq!(
-            "620000FFFF1866EBEB53B807B26A2416F317",
-            encode_aux_file_key("other_file_not_supported").to_string()
+            "620000FFFF2B6ECC8AEF93F643DC44F15E03",
+            encode_aux_file_key("other_file_not_supported").to_string(),
         );
     }
 

From c6d5ff944db91f498e46fa24eb0d667abdf94dba Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 15 May 2024 14:29:12 -0400
Subject: [PATCH 063/130] fix(test): ensure fixtures are correctly used for
 pageserver_aux_file_policy (#7769)

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d3aadbe612..a6fd4792dd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1393,7 +1393,7 @@ def neon_env_builder(
     top_output_dir: Path,
     pageserver_virtual_file_io_engine: str,
     pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
-    pageserver_aux_file_policy: Optional[AuxFileStore] = None,
+    pageserver_aux_file_policy: Optional[AuxFileStore],
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.

From 03c603970748844cbc188f1e0dc6179fa1a1e83d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 16 May 2024 09:26:34 +0100
Subject: [PATCH 064/130] pageserver: refine tenant_id->shard lookup (#7762)

## Problem

This is tech debt from when shard splitting was implemented, to handle
more nicely the edge case of a client reconnect at the moment of the
split.

During shard splits, there were edge cases where we could incorrectly
return NotFound to a getpage@lsn request, prompting an unwanted
reconnect/backoff from the client.

It is already the case that parent shards during splits are marked
InProgress before child shards are created, so `resolve_attached_shard`
will not match on them, thereby implicitly preferring child shards
(good).

However, we were not doing any elegant handling of InProgress in
general: `get_active_tenant_with_timeout` was previously mostly dead
code: it was inspecting the slot found by `resolve_attached_shard` and
maybe waiting for InProgress, but that path is never taken because since
ef7c9c2ccc1a385f74455f45b54faa5b101065e6 the resolve function only ever
returns attached slots.

Closes: https://github.com/neondatabase/neon/issues/7044

## Summary of changes

- Change return value of `resolve_attached_shard` to distinguish between
true NotFound case, and the case where we skipped slots that were
InProgress.
- Rework `get_active_tenant_with_timeout` to loop over calling
resolve_attached_shard, waiting if it sees an InProgress result.

The resulting behavior during a shard split is:
- If we look up a shard early in split when parent is InProgress but
children aren't created yet, we'll wait for the parent to be shut down.
This corresponds to the part of the split where we wait for LSNs to
catch up: so a small delay to the request, but a clean enough handling.
- If we look up a shard while child shards are already present, we will
match on those shards rather than the parent, as intended.
---
 pageserver/src/bin/pageserver.rs |  25 ++--
 pageserver/src/page_service.rs   | 125 +++++++++++------
 pageserver/src/tenant/mgr.rs     | 222 +++++++++++--------------------
 3 files changed, 176 insertions(+), 196 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index a04195e12b..ba5b2608bd 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -647,17 +647,20 @@ fn start_pageserver(
             None,
             "libpq endpoint listener",
             true,
-            async move {
-                page_service::libpq_listener_main(
-                    conf,
-                    broker_client,
-                    pg_auth,
-                    pageserver_listener,
-                    conf.pg_auth_type,
-                    libpq_ctx,
-                    task_mgr::shutdown_token(),
-                )
-                .await
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    page_service::libpq_listener_main(
+                        tenant_manager,
+                        broker_client,
+                        pg_auth,
+                        pageserver_listener,
+                        conf.pg_auth_type,
+                        libpq_ctx,
+                        task_mgr::shutdown_token(),
+                    )
+                    .await
+                }
             },
         );
     }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index f6b251283c..35aba044b2 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -32,6 +32,7 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::Instant;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
@@ -49,7 +50,6 @@ use utils::{
 use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
-use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
@@ -59,13 +59,15 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
-use crate::tenant::mgr;
-use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::mgr::GetTenantError;
+use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
+use crate::tenant::mgr::TenantManager;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
+use crate::tenant::Tenant;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
@@ -135,7 +137,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 /// Listens for connections, and launches a new handler task for each.
 ///
 pub async fn libpq_listener_main(
-    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
     broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     listener: TcpListener,
@@ -180,7 +182,7 @@ pub async fn libpq_listener_main(
                     "serving compute connection task",
                     false,
                     page_service_conn_main(
-                        conf,
+                        tenant_manager.clone(),
                         broker_client.clone(),
                         local_auth,
                         socket,
@@ -203,7 +205,7 @@ pub async fn libpq_listener_main(
 
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
-    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
     broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
@@ -260,7 +262,8 @@ async fn page_service_conn_main(
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler =
+        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
 
     match pgbackend
@@ -291,11 +294,12 @@ struct HandlerTimeline {
 }
 
 struct PageServerHandler {
-    _conf: &'static PageServerConf,
     broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     claims: Option<Claims>,
 
+    tenant_manager: Arc<TenantManager>,
+
     /// The context created for the lifetime of the connection
     /// services by this PageServerHandler.
     /// For each query received over the connection,
@@ -381,13 +385,13 @@ impl From<WaitLsnError> for QueryError {
 
 impl PageServerHandler {
     pub fn new(
-        conf: &'static PageServerConf,
+        tenant_manager: Arc<TenantManager>,
         broker_client: storage_broker::BrokerClientChannel,
         auth: Option<Arc<SwappableJwtAuth>>,
         connection_ctx: RequestContext,
     ) -> Self {
         PageServerHandler {
-            _conf: conf,
+            tenant_manager,
             broker_client,
             auth,
             claims: None,
@@ -552,13 +556,9 @@ impl PageServerHandler {
     {
         debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
-        let tenant = mgr::get_active_tenant_with_timeout(
-            tenant_id,
-            ShardSelector::First,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
+            .await?;
 
         // Make request tracer if needed
         let mut tracer = if tenant.get_trace_read_requests() {
@@ -726,13 +726,9 @@ impl PageServerHandler {
 
         // Create empty timeline
         info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            ShardSelector::Zero,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
+            .await?;
         let timeline = tenant
             .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
             .await?;
@@ -1370,18 +1366,69 @@ impl PageServerHandler {
         timeline_id: TimelineId,
         selector: ShardSelector,
     ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            selector,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        .map_err(GetActiveTimelineError::Tenant)?;
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT)
+            .await
+            .map_err(GetActiveTimelineError::Tenant)?;
         let timeline = tenant.get_timeline(timeline_id, true)?;
         set_tracing_field_shard_id(&timeline);
         Ok(timeline)
     }
+
+    /// Get a shard's [`Tenant`] in its active state, if present.  If we don't find the shard and some
+    /// slots for this tenant are `InProgress` then we will wait.
+    /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait.
+    ///
+    /// `timeout` is used as a total timeout for the whole wait operation.
+    async fn get_active_tenant_with_timeout(
+        &self,
+        tenant_id: TenantId,
+        shard_selector: ShardSelector,
+        timeout: Duration,
+    ) -> Result<Arc<Tenant>, GetActiveTenantError> {
+        let wait_start = Instant::now();
+        let deadline = wait_start + timeout;
+
+        // Resolve TenantId to TenantShardId.  This is usually a quick one-shot thing, the loop is
+        // for handling the rare case that the slot we're accessing is InProgress.
+        let tenant_shard = loop {
+            let resolved = self
+                .tenant_manager
+                .resolve_attached_shard(&tenant_id, shard_selector);
+            match resolved {
+                ShardResolveResult::Found(tenant_shard) => break tenant_shard,
+                ShardResolveResult::NotFound => {
+                    return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                        tenant_id,
+                    )));
+                }
+                ShardResolveResult::InProgress(barrier) => {
+                    // We can't authoritatively answer right now: wait for InProgress state
+                    // to end, then try again
+                    tokio::select! {
+                        _ = self.await_connection_cancelled() => {
+                            return Err(GetActiveTenantError::Cancelled)
+                        },
+                        _  = barrier.wait() => {
+                            // The barrier completed: proceed around the loop to try looking up again
+                        },
+                        _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                            return Err(GetActiveTenantError::WaitForActiveTimeout {
+                                latest_state: None,
+                                wait_time: timeout,
+                            });
+                        }
+                    }
+                }
+            };
+        };
+
+        tracing::debug!("Waiting for tenant to enter active state...");
+        tenant_shard
+            .wait_to_become_active(deadline.duration_since(Instant::now()))
+            .await?;
+        Ok(tenant_shard)
+    }
 }
 
 #[async_trait::async_trait]
@@ -1771,13 +1818,13 @@ where
 
             self.check_permission(Some(tenant_id))?;
 
-            let tenant = get_active_tenant_with_timeout(
-                tenant_id,
-                ShardSelector::Zero,
-                ACTIVE_TENANT_TIMEOUT,
-                &task_mgr::shutdown_token(),
-            )
-            .await?;
+            let tenant = self
+                .get_active_tenant_with_timeout(
+                    tenant_id,
+                    ShardSelector::Zero,
+                    ACTIVE_TENANT_TIMEOUT,
+                )
+                .await?;
             pgb.write_message_noflush(&BeMessage::RowDescription(&[
                 RowDescriptor::int8_col(b"checkpoint_distance"),
                 RowDescriptor::int8_col(b"checkpoint_timeout"),
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 5abda7b64e..1d8e2cf6d3 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -16,10 +16,9 @@ use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
 use sysinfo::SystemExt;
 use tokio::fs;
-use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
 
 use anyhow::Context;
 use once_cell::sync::Lazy;
@@ -119,6 +118,7 @@ pub(crate) enum TenantsMapRemoveResult {
 
 /// When resolving a TenantId to a shard, we may be looking for the 0th
 /// shard, or we might be looking for whichever shard holds a particular page.
+#[derive(Copy, Clone)]
 pub(crate) enum ShardSelector {
     /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
     /// ignore it.
@@ -169,6 +169,14 @@ impl TenantStartupMode {
     }
 }
 
+/// Result type for looking up a TenantId to a specific shard
+pub(crate) enum ShardResolveResult {
+    NotFound,
+    Found(Arc<Tenant>),
+    // Wait for this barrrier, then query again
+    InProgress(utils::completion::Barrier),
+}
+
 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -182,51 +190,6 @@ impl TenantsMap {
         }
     }
 
-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_attached_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> Option<TenantShardId> {
-        let mut want_shard = None;
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        _ => continue,
-                    };
-
-                    match selector {
-                        ShardSelector::First => return Some(*slot.0),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return Some(*slot.0)
-                        }
-                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
-
-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return Some(*slot.0);
-                            }
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find an acceptable shard
-                None
-            }
-        }
-    }
-
     /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
     ///
     /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -2053,6 +2016,72 @@ impl TenantManager {
 
         Ok(reparented)
     }
+
+    /// A page service client sends a TenantId, and to look up the correct Tenant we must
+    /// resolve this to a fully qualified TenantShardId.
+    ///
+    /// During shard splits: we shall see parent shards in InProgress state and skip them, and
+    /// instead match on child shards which should appear in Attached state.  Very early in a shard
+    /// split, or in other cases where a shard is InProgress, we will return our own InProgress result
+    /// to instruct the caller to wait for that to finish before querying again.
+    pub(crate) fn resolve_attached_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> ShardResolveResult {
+        let tenants = self.tenants.read().unwrap();
+        let mut want_shard = None;
+        let mut any_in_progress = None;
+
+        match &*tenants {
+            TenantsMap::Initializing => ShardResolveResult::NotFound,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    // Ignore all slots that don't contain an attached tenant
+                    let tenant = match &slot.1 {
+                        TenantSlot::Attached(t) => t,
+                        TenantSlot::InProgress(barrier) => {
+                            // We might still find a usable shard, but in case we don't, remember that
+                            // we saw at least one InProgress slot, so that we can distinguish this case
+                            // from a simple NotFound in our return value.
+                            any_in_progress = Some(barrier.clone());
+                            continue;
+                        }
+                        _ => continue,
+                    };
+
+                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return ShardResolveResult::Found(tenant.clone())
+                        }
+                        ShardSelector::Page(key) => {
+                            // First slot we see for this tenant, calculate the expected shard number
+                            // for the key: we will use this for checking if this and subsequent
+                            // slots contain the key, rather than recalculating the hash each time.
+                            if want_shard.is_none() {
+                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            }
+
+                            if Some(tenant.shard_identity.number) == want_shard {
+                                return ShardResolveResult::Found(tenant.clone());
+                            }
+                        }
+                        _ => continue,
+                    }
+                }
+
+                // Fall through: we didn't find a slot that was in Attached state & matched our selector.  If
+                // we found one or more InProgress slot, indicate to caller that they should retry later.  Otherwise
+                // this requested shard simply isn't found.
+                if let Some(barrier) = any_in_progress {
+                    ShardResolveResult::InProgress(barrier)
+                } else {
+                    ShardResolveResult::NotFound
+                }
+            }
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -2101,105 +2130,6 @@ pub(crate) enum GetActiveTenantError {
     Broken(String),
 }
 
-/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
-/// state, then wait for up to `timeout`.  If the [`Tenant`] is not currently in [`TenantState::Active`],
-/// then wait for up to `timeout` (minus however long we waited for the slot).
-pub(crate) async fn get_active_tenant_with_timeout(
-    tenant_id: TenantId,
-    shard_selector: ShardSelector,
-    timeout: Duration,
-    cancel: &CancellationToken,
-) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    enum WaitFor {
-        Barrier(utils::completion::Barrier),
-        Tenant(Arc<Tenant>),
-    }
-
-    let wait_start = Instant::now();
-    let deadline = wait_start + timeout;
-
-    let (wait_for, tenant_shard_id) = {
-        let locked = TENANTS.read().unwrap();
-
-        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked
-            .resolve_attached_shard(&tenant_id, shard_selector)
-            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                tenant_id,
-            )))?;
-
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
-            .map_err(GetTenantError::MapState)?;
-        match peek_slot {
-            Some(TenantSlot::Attached(tenant)) => {
-                match tenant.current_state() {
-                    TenantState::Active => {
-                        // Fast path: we don't need to do any async waiting.
-                        return Ok(tenant.clone());
-                    }
-                    _ => {
-                        tenant.activate_now();
-                        (WaitFor::Tenant(tenant.clone()), tenant_shard_id)
-                    }
-                }
-            }
-            Some(TenantSlot::Secondary(_)) => {
-                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
-                    tenant_shard_id,
-                )))
-            }
-            Some(TenantSlot::InProgress(barrier)) => {
-                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
-            }
-            None => {
-                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                    tenant_id,
-                )))
-            }
-        }
-    };
-
-    let tenant = match wait_for {
-        WaitFor::Barrier(barrier) => {
-            tracing::debug!("Waiting for tenant InProgress state to pass...");
-            timeout_cancellable(
-                deadline.duration_since(Instant::now()),
-                cancel,
-                barrier.wait(),
-            )
-            .await
-            .map_err(|e| match e {
-                TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout {
-                    latest_state: None,
-                    wait_time: wait_start.elapsed(),
-                },
-                TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled,
-            })?;
-            {
-                let locked = TENANTS.read().unwrap();
-                let peek_slot =
-                    tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
-                        .map_err(GetTenantError::MapState)?;
-                match peek_slot {
-                    Some(TenantSlot::Attached(tenant)) => tenant.clone(),
-                    _ => {
-                        return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
-                            tenant_shard_id,
-                        )))
-                    }
-                }
-            }
-        }
-        WaitFor::Tenant(tenant) => tenant,
-    };
-
-    tracing::debug!("Waiting for tenant to enter active state...");
-    tenant
-        .wait_to_become_active(deadline.duration_since(Instant::now()))
-        .await?;
-    Ok(tenant)
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTimelineError {
     #[error("Tenant {0}")]

From 923cf91aa4c1986c47fba0158fff0ac9bf8225ce Mon Sep 17 00:00:00 2001
From: Andrew Rudenko <me@prepor.dev>
Date: Thu, 16 May 2024 12:04:16 +0200
Subject: [PATCH 065/130] compute_ctl: catalog API endpoints (#7575)

## Problem

There are two cloud's features that require extra compute endpoints.

1. We are running pg_dump to get DB schemas. Currently, we are using a
special service for this. But it would be great to execute pg_dump in an
isolated environment. And we already have such an environment, it's our
compute! And likely enough pg_dump already exists there too! (see
https://github.com/neondatabase/cloud/issues/11644#issuecomment-2084617832)
2. We need to have a way to get databases and roles from compute after
time travel (see https://github.com/neondatabase/cloud/issues/12109)

## Summary of changes

It adds two API endpoints to compute_ctl HTTP API that target both of
the aforementioned cases.

---------

Co-authored-by: Tristan Partin <tristan@neon.tech>
---
 Cargo.lock                                  |   2 +
 compute_tools/Cargo.toml                    |   2 +
 compute_tools/src/catalog.rs                | 116 ++++++++++++++++++++
 compute_tools/src/http/api.rs               |  47 ++++++++
 compute_tools/src/http/openapi_spec.yaml    | 112 +++++++++++++++++++
 compute_tools/src/lib.rs                    |   1 +
 libs/compute_api/src/responses.rs           |   8 +-
 test_runner/fixtures/endpoint/__init__.py   |   0
 test_runner/fixtures/endpoint/http.py       |  23 ++++
 test_runner/fixtures/neon_fixtures.py       |   8 ++
 test_runner/regress/test_compute_catalog.py |  34 ++++++
 11 files changed, 352 insertions(+), 1 deletion(-)
 create mode 100644 compute_tools/src/catalog.rs
 create mode 100644 test_runner/fixtures/endpoint/__init__.py
 create mode 100644 test_runner/fixtures/endpoint/http.py
 create mode 100644 test_runner/regress/test_compute_catalog.py

diff --git a/Cargo.lock b/Cargo.lock
index 961101b151..b1f53404ea 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1233,8 +1233,10 @@ dependencies = [
  "serde_json",
  "signal-hook",
  "tar",
+ "thiserror",
  "tokio",
  "tokio-postgres",
+ "tokio-stream",
  "tokio-util",
  "toml_edit",
  "tracing",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 759a117ee9..8f96530a9d 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
+tokio-stream.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
+thiserror.workspace = true
 url.workspace = true
 
 compute_api.workspace = true
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
new file mode 100644
index 0000000000..4fefa831e0
--- /dev/null
+++ b/compute_tools/src/catalog.rs
@@ -0,0 +1,116 @@
+use compute_api::{
+    responses::CatalogObjects,
+    spec::{Database, Role},
+};
+use futures::Stream;
+use postgres::{Client, NoTls};
+use std::{path::Path, process::Stdio, result::Result, sync::Arc};
+use tokio::{
+    io::{AsyncBufReadExt, BufReader},
+    process::Command,
+    task,
+};
+use tokio_stream::{self as stream, StreamExt};
+use tokio_util::codec::{BytesCodec, FramedRead};
+use tracing::warn;
+
+use crate::{
+    compute::ComputeNode,
+    pg_helpers::{get_existing_dbs, get_existing_roles},
+};
+
+pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
+    let connstr = compute.connstr.clone();
+    task::spawn_blocking(move || {
+        let mut client = Client::connect(connstr.as_str(), NoTls)?;
+        let roles: Vec<Role>;
+        {
+            let mut xact = client.transaction()?;
+            roles = get_existing_roles(&mut xact)?;
+        }
+        let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
+
+        Ok(CatalogObjects { roles, databases })
+    })
+    .await?
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum SchemaDumpError {
+    #[error("Database does not exist.")]
+    DatabaseDoesNotExist,
+    #[error("Failed to execute pg_dump.")]
+    IO(#[from] std::io::Error),
+}
+
+// It uses the pg_dump utility to dump the schema of the specified database.
+// The output is streamed back to the caller and supposed to be streamed via HTTP.
+//
+// Before return the result with the output, it checks that pg_dump produced any output.
+// If not, it tries to parse the stderr output to determine if the database does not exist
+// and special error is returned.
+//
+// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
+pub async fn get_database_schema(
+    compute: &Arc<ComputeNode>,
+    dbname: &str,
+) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
+    let pgbin = &compute.pgbin;
+    let basepath = Path::new(pgbin).parent().unwrap();
+    let pgdump = basepath.join("pg_dump");
+    let mut connstr = compute.connstr.clone();
+    connstr.set_path(dbname);
+    let mut cmd = Command::new(pgdump)
+        .arg("--schema-only")
+        .arg(connstr.as_str())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .kill_on_drop(true)
+        .spawn()?;
+
+    let stdout = cmd.stdout.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
+    })?;
+
+    let stderr = cmd.stderr.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
+    })?;
+
+    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
+    let stderr_reader = BufReader::new(stderr);
+
+    let first_chunk = match stdout_reader.next().await {
+        Some(Ok(bytes)) if !bytes.is_empty() => bytes,
+        Some(Err(e)) => {
+            return Err(SchemaDumpError::IO(e));
+        }
+        _ => {
+            let mut lines = stderr_reader.lines();
+            if let Some(line) = lines.next_line().await? {
+                if line.contains(&format!("FATAL:  database \"{}\" does not exist", dbname)) {
+                    return Err(SchemaDumpError::DatabaseDoesNotExist);
+                }
+                warn!("pg_dump stderr: {}", line)
+            }
+            tokio::spawn(async move {
+                while let Ok(Some(line)) = lines.next_line().await {
+                    warn!("pg_dump stderr: {}", line)
+                }
+            });
+
+            return Err(SchemaDumpError::IO(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "failed to start pg_dump",
+            )));
+        }
+    };
+    let initial_stream = stream::once(Ok(first_chunk.freeze()));
+    // Consume stderr and log warnings
+    tokio::spawn(async move {
+        let mut lines = stderr_reader.lines();
+        while let Ok(Some(line)) = lines.next_line().await {
+            warn!("pg_dump stderr: {}", line)
+        }
+    });
+    Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
+}
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 128783b477..0286429cf2 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,17 +5,21 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
 
+use crate::catalog::SchemaDumpError;
+use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
 
 use anyhow::Result;
+use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
+use utils::http::request::must_get_query_param;
 
 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
     ComputeStatusResponse {
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
             }
         }
 
+        (&Method::GET, "/dbs_and_roles") => {
+            info!("serving /dbs_and_roles GET request",);
+            match get_dbs_and_roles(compute).await {
+                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
+                Err(_) => {
+                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
+        (&Method::GET, "/database_schema") => {
+            let database = match must_get_query_param(&req, "database") {
+                Err(e) => return e.into_response(),
+                Ok(database) => database,
+            };
+            info!("serving /database_schema GET request with database: {database}",);
+            match get_database_schema(compute, &database).await {
+                Ok(res) => render_plain(Body::wrap_stream(res)),
+                Err(SchemaDumpError::DatabaseDoesNotExist) => {
+                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
+                }
+                Err(e) => {
+                    error!("can't get schema dump: {}", e);
+                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
         // download extension files from remote extension storage on demand
         (&Method::POST, route) if route.starts_with("/extension_server/") => {
             info!("serving {:?} POST request", route);
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
     };
     Response::builder()
         .status(status)
+        .header(CONTENT_TYPE, "application/json")
         .body(Body::from(serde_json::to_string(&error).unwrap()))
         .unwrap()
 }
 
+fn render_json(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "application/json")
+        .body(body)
+        .unwrap()
+}
+
+fn render_plain(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "text/plain")
+        .body(body)
+        .unwrap()
+}
+
 async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
     {
         let mut state = compute.state.lock().unwrap();
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index d2ec54299f..b0ddaeae2b 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -68,6 +68,51 @@ paths:
               schema:
                 $ref: "#/components/schemas/Info"
 
+  /dbs_and_roles:
+    get:
+      tags:
+        - Info
+      summary: Get databases and roles in the catalog.
+      description: ""
+      operationId: getDbsAndRoles
+      responses:
+        200:
+          description: Compute schema objects
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/DbsAndRoles"
+
+  /database_schema:
+    get:
+      tags:
+        - Info
+      summary: Get schema dump
+      parameters:
+        - name: database
+          in: query
+          description: Database name to dump.
+          required: true
+          schema:
+            type: string
+          example: "postgres"
+      description: Get schema dump in SQL format.
+      operationId: getDatabaseSchema
+      responses:
+        200:
+          description: Schema dump
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Schema dump in SQL format.
+        404:
+          description: Non existing database.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
   /check_writability:
     post:
       tags:
@@ -229,6 +274,73 @@ components:
         num_cpus:
           type: integer
 
+    DbsAndRoles:
+      type: object
+      description: Databases and Roles
+      required:
+        - roles
+        - databases
+      properties:
+        roles:
+          type: array
+          items:
+            $ref: "#/components/schemas/Role"
+        databases:
+          type: array
+          items:
+            $ref: "#/components/schemas/Database"
+
+    Database:
+      type: object
+      description: Database
+      required:
+        - name
+        - owner
+        - restrict_conn
+        - invalid
+      properties:
+        name:
+          type: string
+        owner:
+          type: string
+        options:
+          type: array
+          items:
+            $ref: "#/components/schemas/GenericOption"
+        restrict_conn:
+          type: boolean
+        invalid:
+          type: boolean
+
+    Role:
+      type: object
+      description: Role
+      required:
+        - name
+      properties:
+        name:
+          type: string
+        encrypted_password:
+          type: string
+        options:
+          type: array
+          items:
+            $ref: "#/components/schemas/GenericOption"
+
+    GenericOption:
+      type: object
+      description: Schema Generic option
+      required:
+        - name
+        - vartype
+      properties:
+        name:
+          type: string
+        value:
+          type: string
+        vartype:
+          type: string
+
     ComputeState:
       type: object
       required:
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index eac808385c..18c228ba54 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,6 +8,7 @@ pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
+pub mod catalog;
 pub mod compute;
 pub mod extension_server;
 pub mod monitor;
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index fd0c90d447..d05d625b0a 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -3,7 +3,7 @@
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};
 
-use crate::spec::ComputeSpec;
+use crate::spec::{ComputeSpec, Database, Role};
 
 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
     pub total_ext_download_size: u64,
 }
 
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct CatalogObjects {
+    pub roles: Vec<Role>,
+    pub databases: Vec<Database>,
+}
+
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 /// This is not actually a compute API response, so consider moving
 /// to a different place.
diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
new file mode 100644
index 0000000000..42f0539c19
--- /dev/null
+++ b/test_runner/fixtures/endpoint/http.py
@@ -0,0 +1,23 @@
+import requests
+from requests.adapters import HTTPAdapter
+
+
+class EndpointHttpClient(requests.Session):
+    def __init__(
+        self,
+        port: int,
+    ):
+        super().__init__()
+        self.port = port
+
+        self.mount("http://", HTTPAdapter())
+
+    def dbs_and_roles(self):
+        res = self.get(f"http://localhost:{self.port}/dbs_and_roles")
+        res.raise_for_status()
+        return res.json()
+
+    def database_schema(self, database: str):
+        res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
+        res.raise_for_status()
+        return res.text
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a6fd4792dd..b4761f103b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -48,6 +48,7 @@ from urllib3.util.retry import Retry
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
@@ -3373,6 +3374,13 @@ class Endpoint(PgProtocol):
         self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
         # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
 
+    def http_client(
+        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
+    ) -> EndpointHttpClient:
+        return EndpointHttpClient(
+            port=self.http_port,
+        )
+
     def create(
         self,
         branch_name: str,
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
new file mode 100644
index 0000000000..dd36190fcd
--- /dev/null
+++ b/test_runner/regress/test_compute_catalog.py
@@ -0,0 +1,34 @@
+import requests
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_compute_catalog(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_config", "empty")
+
+    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    client = endpoint.http_client()
+
+    objects = client.dbs_and_roles()
+
+    # Assert that 'cloud_admin' role exists in the 'roles' list
+    assert any(
+        role["name"] == "cloud_admin" for role in objects["roles"]
+    ), "The 'cloud_admin' role is missing"
+
+    # Assert that 'postgres' database exists in the 'databases' list
+    assert any(
+        db["name"] == "postgres" for db in objects["databases"]
+    ), "The 'postgres' database is missing"
+
+    ddl = client.database_schema(database="postgres")
+
+    assert "-- PostgreSQL database dump" in ddl
+
+    try:
+        client.database_schema(database="nonexistentdb")
+        raise AssertionError("Expected HTTPError was not raised")
+    except requests.exceptions.HTTPError as e:
+        assert (
+            e.response.status_code == 404
+        ), f"Expected 404 status code, but got {e.response.status_code}"

From 790c05d67543a2f183193eff66de4e90dbe2e7f9 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 16 May 2024 12:05:50 +0100
Subject: [PATCH 066/130] proxy: swap tungstenite for a simpler impl (#7353)

## Problem

I wanted to do a deep dive of the tungstenite codebase.
tokio-tungstenite is incredibly convoluted... In my searching I found
[fastwebsockets by deno](https://github.com/denoland/fastwebsockets),
but it wasn't quite sufficient.

This also removes the default 16MB/64MB frame/message size limitation.
framed-websockets solves this by inserting continuation frames for
partially received messages, so the whole message does not need to be
entirely read into memory.

## Summary of changes

I took the fastwebsockets code as a starting off point and rewrote it to
be simpler, server-only, and be poll-based to support our Read/Write
wrappers.

I have replaced our tungstenite code with my framed-websockets fork.

<https://github.com/neondatabase/framed-websockets>
---
 Cargo.lock                                   | 109 ++++++++-----------
 Cargo.toml                                   |   7 +-
 proxy/Cargo.toml                             |   4 +-
 proxy/src/serverless.rs                      |   9 +-
 proxy/src/serverless/websocket.rs            |  93 ++++++++--------
 test_runner/regress/test_proxy_websockets.py |   9 +-
 6 files changed, 107 insertions(+), 124 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b1f53404ea..e1edd53fea 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -708,7 +708,7 @@ dependencies = [
  "sha1",
  "sync_wrapper",
  "tokio",
- "tokio-tungstenite 0.20.0",
+ "tokio-tungstenite",
  "tower",
  "tower-layer",
  "tower-service",
@@ -979,6 +979,12 @@ version = "3.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
 
+[[package]]
+name = "bytemuck"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -1598,7 +1604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
  "cfg-if",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "lock_api",
  "once_cell",
  "parking_lot_core 0.9.8",
@@ -1999,6 +2005,27 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "framed-websockets"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
+dependencies = [
+ "base64 0.21.1",
+ "bytemuck",
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "http-body-util",
+ "hyper 1.2.0",
+ "hyper-util",
+ "pin-project",
+ "rand 0.8.5",
+ "sha1",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "fs2"
 version = "0.4.3"
@@ -2277,9 +2304,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.14.0"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 dependencies = [
  "ahash",
  "allocator-api2",
@@ -2287,11 +2314,11 @@ dependencies = [
 
 [[package]]
 name = "hashlink"
-version = "0.8.4"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
+checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
 dependencies = [
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -2600,21 +2627,6 @@ dependencies = [
  "tokio-native-tls",
 ]
 
-[[package]]
-name = "hyper-tungstenite"
-version = "0.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
-dependencies = [
- "http-body-util",
- "hyper 1.2.0",
- "hyper-util",
- "pin-project-lite",
- "tokio",
- "tokio-tungstenite 0.21.0",
- "tungstenite 0.21.0",
-]
-
 [[package]]
 name = "hyper-util"
 version = "0.1.3"
@@ -2692,7 +2704,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
 dependencies = [
  "equivalent",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -2954,7 +2966,7 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
 dependencies = [
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -3007,7 +3019,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
  "bytes",
  "crossbeam-utils",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "itoa",
  "lasso",
  "measured-derive",
@@ -3569,7 +3581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
 dependencies = [
  "dlv-list",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -3896,7 +3908,7 @@ dependencies = [
  "ahash",
  "bytes",
  "chrono",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "num",
  "num-bigint",
  "paste",
@@ -4380,9 +4392,10 @@ dependencies = [
  "dashmap",
  "env_logger",
  "fallible-iterator",
+ "framed-websockets",
  "futures",
  "git-version",
- "hashbrown 0.13.2",
+ "hashbrown 0.14.5",
  "hashlink",
  "hex",
  "hmac",
@@ -4392,7 +4405,6 @@ dependencies = [
  "humantime",
  "hyper 0.14.26",
  "hyper 1.2.0",
- "hyper-tungstenite",
  "hyper-util",
  "indexmap 2.0.1",
  "ipnet",
@@ -4437,7 +4449,6 @@ dependencies = [
  "smol_str",
  "socket2 0.5.5",
  "subtle",
- "sync_wrapper",
  "task-local-extensions",
  "thiserror",
  "tikv-jemalloc-ctl",
@@ -4446,6 +4457,7 @@ dependencies = [
  "tokio-postgres",
  "tokio-postgres-rustls",
  "tokio-rustls 0.25.0",
+ "tokio-tungstenite",
  "tokio-util",
  "tower-service",
  "tracing",
@@ -6382,19 +6394,7 @@ dependencies = [
  "futures-util",
  "log",
  "tokio",
- "tungstenite 0.20.1",
-]
-
-[[package]]
-name = "tokio-tungstenite"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
-dependencies = [
- "futures-util",
- "log",
- "tokio",
- "tungstenite 0.21.0",
+ "tungstenite",
 ]
 
 [[package]]
@@ -6408,7 +6408,7 @@ dependencies = [
  "futures-io",
  "futures-sink",
  "futures-util",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "pin-project-lite",
  "tokio",
  "tracing",
@@ -6690,25 +6690,6 @@ dependencies = [
  "utf-8",
 ]
 
-[[package]]
-name = "tungstenite"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
-dependencies = [
- "byteorder",
- "bytes",
- "data-encoding",
- "http 1.1.0",
- "httparse",
- "log",
- "rand 0.8.5",
- "sha1",
- "thiserror",
- "url",
- "utf-8",
-]
-
 [[package]]
 name = "twox-hash"
 version = "1.6.3"
@@ -7504,7 +7485,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "getrandom 0.2.11",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "hex",
  "hmac",
  "hyper 0.14.26",
diff --git a/Cargo.toml b/Cargo.toml
index 3ccdabee18..b59a5dcd6d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,13 +81,14 @@ enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
+framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
-hashbrown = "0.13"
-hashlink = "0.8.4"
+hashbrown = "0.14"
+hashlink = "0.9.1"
 hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
@@ -98,7 +99,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.13.0"
+tokio-tungstenite = "0.20.0"
 indexmap = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 3002006aed..5f9b0aa75b 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -26,6 +26,7 @@ clap.workspace = true
 consumption_metrics.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
+framed-websockets.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -35,7 +36,6 @@ hmac.workspace = true
 hostname.workspace = true
 http.workspace = true
 humantime.workspace = true
-hyper-tungstenite.workspace = true
 hyper.workspace = true
 hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
 hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -76,7 +76,6 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
-sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
@@ -106,6 +105,7 @@ workspace_hack.workspace = true
 [dev-dependencies]
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
+tokio-tungstenite.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index f634ab4e98..24ee749e6e 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -102,7 +102,7 @@ pub async fn task_main(
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     connections.close(); // allows `connections.wait to complete`
 
-    let server = Builder::new(hyper_util::rt::TokioExecutor::new());
+    let server = Builder::new(TokioExecutor::new());
 
     while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
         let (conn, peer_addr) = res.context("could not accept TCP stream")?;
@@ -255,7 +255,6 @@ async fn connection_handler(
                 .in_current_span()
                 .map_ok_or_else(api_error_into_response, |r| r),
             );
-
             async move {
                 let res = handler.await;
                 cancel_request.disarm();
@@ -301,7 +300,7 @@ async fn request_handler(
         .map(|s| s.to_string());
 
     // Check if the request is a websocket upgrade request.
-    if hyper_tungstenite::is_upgrade_request(&request) {
+    if framed_websockets::upgrade::is_upgrade_request(&request) {
         let ctx = RequestMonitoring::new(
             session_id,
             peer_addr,
@@ -312,7 +311,7 @@ async fn request_handler(
         let span = ctx.span.clone();
         info!(parent: &span, "performing websocket upgrade");
 
-        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+        let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
             .map_err(|e| ApiError::BadRequest(e.into()))?;
 
         ws_connections.spawn(
@@ -334,7 +333,7 @@ async fn request_handler(
         );
 
         // Return the response so the spawned future can continue.
-        Ok(response)
+        Ok(response.map(|_: http_body_util::Empty<Bytes>| Full::new(Bytes::new())))
     } else if request.uri().path() == "/sql" && *request.method() == Method::POST {
         let ctx = RequestMonitoring::new(
             session_id,
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 649bec2c7c..61d6d60dbe 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -7,10 +7,11 @@ use crate::{
     proxy::{handle_client, ClientMode},
     rate_limiter::EndpointRateLimiter,
 };
-use bytes::{Buf, Bytes};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use framed_websockets::{Frame, OpCode, WebSocketServer};
 use futures::{Sink, Stream};
-use hyper::upgrade::Upgraded;
-use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
+use hyper1::upgrade::OnUpgrade;
+use hyper_util::rt::TokioIo;
 use pin_project_lite::pin_project;
 
 use std::{
@@ -21,25 +22,23 @@ use std::{
 use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
 use tracing::warn;
 
-// TODO: use `std::sync::Exclusive` once it's stabilized.
-// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
-use sync_wrapper::SyncWrapper;
-
 pin_project! {
     /// This is a wrapper around a [`WebSocketStream`] that
     /// implements [`AsyncRead`] and [`AsyncWrite`].
-    pub struct WebSocketRw<S = Upgraded> {
+    pub struct WebSocketRw<S> {
         #[pin]
-        stream: SyncWrapper<WebSocketStream<S>>,
-        bytes: Bytes,
+        stream: WebSocketServer<S>,
+        recv: Bytes,
+        send: BytesMut,
     }
 }
 
 impl<S> WebSocketRw<S> {
-    pub fn new(stream: WebSocketStream<S>) -> Self {
+    pub fn new(stream: WebSocketServer<S>) -> Self {
         Self {
-            stream: stream.into(),
-            bytes: Bytes::new(),
+            stream,
+            recv: Bytes::new(),
+            send: BytesMut::new(),
         }
     }
 }
@@ -50,22 +49,24 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
         cx: &mut Context<'_>,
         buf: &[u8],
     ) -> Poll<io::Result<usize>> {
-        let mut stream = self.project().stream.get_pin_mut();
+        let this = self.project();
+        let mut stream = this.stream;
+        this.send.put(buf);
 
         ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
-        match stream.as_mut().start_send(Message::Binary(buf.into())) {
+        match stream.as_mut().start_send(Frame::binary(this.send.split())) {
             Ok(()) => Poll::Ready(Ok(buf.len())),
             Err(e) => Poll::Ready(Err(io_error(e))),
         }
     }
 
     fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
+        let stream = self.project().stream;
         stream.poll_flush(cx).map_err(io_error)
     }
 
     fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
+        let stream = self.project().stream;
         stream.poll_close(cx).map_err(io_error)
     }
 }
@@ -76,13 +77,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
         cx: &mut Context<'_>,
         buf: &mut ReadBuf<'_>,
     ) -> Poll<io::Result<()>> {
-        if buf.remaining() > 0 {
-            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
-            let len = std::cmp::min(bytes.len(), buf.remaining());
-            buf.put_slice(&bytes[..len]);
-            self.consume(len);
-        }
-
+        let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
+        let len = std::cmp::min(bytes.len(), buf.remaining());
+        buf.put_slice(&bytes[..len]);
+        self.consume(len);
         Poll::Ready(Ok(()))
     }
 }
@@ -94,31 +92,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 
         let mut this = self.project();
         loop {
-            if !this.bytes.chunk().is_empty() {
-                let chunk = (*this.bytes).chunk();
+            if !this.recv.chunk().is_empty() {
+                let chunk = (*this.recv).chunk();
                 return Poll::Ready(Ok(chunk));
             }
 
-            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
+            let res = ready!(this.stream.as_mut().poll_next(cx));
             match res.transpose().map_err(io_error)? {
-                Some(message) => match message {
-                    Message::Ping(_) => {}
-                    Message::Pong(_) => {}
-                    Message::Text(text) => {
+                Some(message) => match message.opcode {
+                    OpCode::Ping => {}
+                    OpCode::Pong => {}
+                    OpCode::Text => {
                         // We expect to see only binary messages.
                         let error = "unexpected text message in the websocket";
-                        warn!(length = text.len(), error);
+                        warn!(length = message.payload.len(), error);
                         return Poll::Ready(Err(io_error(error)));
                     }
-                    Message::Frame(_) => {
-                        // This case is impossible according to Frame's doc.
-                        panic!("unexpected raw frame in the websocket");
+                    OpCode::Binary | OpCode::Continuation => {
+                        debug_assert!(this.recv.is_empty());
+                        *this.recv = message.payload.freeze();
                     }
-                    Message::Binary(chunk) => {
-                        assert!(this.bytes.is_empty());
-                        *this.bytes = Bytes::from(chunk);
-                    }
-                    Message::Close(_) => return EOF,
+                    OpCode::Close => return EOF,
                 },
                 None => return EOF,
             }
@@ -126,19 +120,21 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
     }
 
     fn consume(self: Pin<&mut Self>, amount: usize) {
-        self.project().bytes.advance(amount);
+        self.project().recv.advance(amount);
     }
 }
 
 pub async fn serve_websocket(
     config: &'static ProxyConfig,
     mut ctx: RequestMonitoring,
-    websocket: HyperWebsocket,
+    websocket: OnUpgrade,
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     hostname: Option<String>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
+    let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket));
+
     let conn_gauge = Metrics::get()
         .proxy
         .client_connections
@@ -177,15 +173,16 @@ pub async fn serve_websocket(
 mod tests {
     use std::pin::pin;
 
+    use framed_websockets::WebSocketServer;
     use futures::{SinkExt, StreamExt};
-    use hyper_tungstenite::{
-        tungstenite::{protocol::Role, Message},
-        WebSocketStream,
-    };
     use tokio::{
         io::{duplex, AsyncReadExt, AsyncWriteExt},
         task::JoinSet,
     };
+    use tokio_tungstenite::{
+        tungstenite::{protocol::Role, Message},
+        WebSocketStream,
+    };
 
     use super::WebSocketRw;
 
@@ -210,9 +207,7 @@ mod tests {
         });
 
         js.spawn(async move {
-            let mut rw = pin!(WebSocketRw::new(
-                WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
-            ));
+            let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2)));
 
             let mut buf = vec![0; 1024];
             let n = rw.read(&mut buf).await.unwrap();
diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py
index 6d1cb9765a..6211446a40 100644
--- a/test_runner/regress/test_proxy_websockets.py
+++ b/test_runner/regress/test_proxy_websockets.py
@@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy):
         query_message = "SELECT 1".encode("utf-8") + b"\0"
         length2 = (4 + len(query_message)).to_bytes(4, byteorder="big")
         await websocket.send(
-            [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
+            length0
+            + startup_message
+            + b"p"
+            + length1
+            + auth_message
+            + b"Q"
+            + length2
+            + query_message
         )
 
         startup_response = await websocket.recv()

From ec069dc45ec6c0ef9b500dc0f433a0415b7e26db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 16 May 2024 16:48:49 +0200
Subject: [PATCH 067/130] tiered compaction: introduce PAGE_SZ constant and use
 it (#7785)

pointed out by @problame : we use the literal 8192 instead of a properly
defined constant. replace the literal by a PAGE_SZ constant.
---
 pageserver/compaction/src/bin/compaction-simulator.rs | 3 ++-
 pageserver/compaction/src/compact_tiered.rs           | 6 +++---
 pageserver/compaction/src/helpers.rs                  | 2 ++
 pageserver/compaction/src/simulator.rs                | 3 ++-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs
index 1fd69407d3..c308694ae1 100644
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -1,4 +1,5 @@
 use clap::{Parser, Subcommand};
+use pageserver_compaction::helpers::PAGE_SZ;
 use pageserver_compaction::simulator::MockTimeline;
 use rand::Rng;
 use std::io::Write;
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
     let mut executor = MockTimeline::new();
 
     // Convert the logical size in MB into a key range.
-    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
+    let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
     //let key_range = u64::MIN..u64::MAX;
     println!(
         "starting simulation with key range {:016X}-{:016X}",
diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index a8f184af24..33c9948f45 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
 use std::ops::Range;
 
 use crate::helpers::{
-    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
+    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
 };
 use crate::interface::*;
 use utils::lsn::Lsn;
@@ -379,7 +379,7 @@ where
                 .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                 .await?,
             &self.shard_identity,
-        ) * 8192;
+        ) * PAGE_SZ;
 
         let wal_size = job
             .input_layers
@@ -441,7 +441,7 @@ where
         let mut window = KeyspaceWindow::new(
             E::Key::MIN..E::Key::MAX,
             keyspace,
-            self.target_file_size / 8192,
+            self.target_file_size / PAGE_SZ,
         );
         while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
             new_jobs.push(CompactionJob::<E> {
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index 2c922b0a49..8ed1d16082 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -16,6 +16,8 @@ use std::pin::Pin;
 use std::task::{ready, Poll};
 use utils::lsn::Lsn;
 
+pub const PAGE_SZ: u64 = 8192;
+
 pub fn keyspace_total_size<K>(
     keyspace: &CompactionKeySpace<K>,
     shard_identity: &ShardIdentity,
diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs
index 3543df64fa..a7c8bd5c1f 100644
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -14,6 +14,7 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::sync::Mutex;
 
+use crate::helpers::PAGE_SZ;
 use crate::helpers::{merge_delta_keys, overlaps_with};
 
 use crate::interface;
@@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
         let new_layer = Arc::new(MockImageLayer {
             key_range: key_range.clone(),
             lsn_range: lsn..lsn,
-            file_size: accum_size * 8192,
+            file_size: accum_size * PAGE_SZ,
             deleted: Mutex::new(false),
         });
         info!(

From 4c5afb7b1000768b4ec6dd3db362c1159a189aaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 16 May 2024 19:35:13 +0200
Subject: [PATCH 068/130] Remove SSO_ACCOUNT_ID from scrubber docs and
 BucketConfig (#7774)

As of #6202 we support `AWS_PROFILE` as well, which is more convenient.
Change the docs to using it instead of `SSO_ACCOUNT_ID`. Also, remove
`SSO_ACCOUNT_ID` from BucketConfig as it is confusing to the code's
reader: it's not the "main" way of setting up authentication for the
scrubber any more.

It is a breaking change for the on-disk format as we persist `sso_account_id` to disk,
but it was quite inconsistent with the other methods which are not persistet. Also,
I don't think we want to support the case where one version writes the json and
another version reads it.

Related: #7667
---
 s3_scrubber/README.md  | 12 +++++++-----
 s3_scrubber/src/lib.rs | 24 ++++--------------------
 2 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md
index c1deab8852..8a96542ada 100644
--- a/s3_scrubber/README.md
+++ b/s3_scrubber/README.md
@@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants
 
 #### S3
 
-Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help).
+Do `aws sso login --profile dev` to get the SSO access to the bucket to clean.
+Also, set the following environment variables:
 
-- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets
+- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`)
 - `REGION`: A region where the bucket is located at.
 - `BUCKET`: Bucket name
+- `BUCKET_PREFIX` (optional): Prefix inside the bucket
 
 #### Console API
 
@@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand.
 
 Example:
 
-`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
 
 #### `purge-garbage`
 
@@ -59,7 +61,7 @@ to pass them on the command line
 
 Example:
 
-`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
 
 Add the `--delete` argument before `purge-garbage` to enable deletion.  This is intentionally
 not provided inline in the example above to avoid accidents.  Without the `--delete` flag
@@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout.
 
 For pageserver:
 ```
-env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
+env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
 
 Timelines: 31106
 With errors: 3
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index 7966fb6a88..e0f99ecd9c 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -200,30 +200,15 @@ impl RootTarget {
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
 pub struct BucketConfig {
     pub region: String,
     pub bucket: String,
     pub prefix_in_bucket: Option<String>,
-
-    /// Use SSO if this is set, else rely on AWS_* environment vars
-    pub sso_account_id: Option<String>,
-}
-
-impl Display for BucketConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}/{}/{}",
-            self.sso_account_id.as_deref().unwrap_or("<none>"),
-            self.region,
-            self.bucket
-        )
-    }
 }
 
 impl BucketConfig {
     pub fn from_env() -> anyhow::Result<Self> {
-        let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
         let region = env::var("REGION").context("'REGION' param retrieval")?;
         let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
         let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
@@ -232,7 +217,6 @@ impl BucketConfig {
             region,
             bucket,
             prefix_in_bucket,
-            sso_account_id,
         })
     }
 }
@@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
     guard
 }
 
-pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client {
+pub fn init_s3_client(bucket_region: Region) -> Client {
     let credentials_provider = {
         // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
         let chain = CredentialsProviderChain::first_try(
@@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
         );
 
         // Use SSO if we were given an account ID
-        match account_id {
+        match std::env::var("SSO_ACCOUNT_ID").ok() {
             Some(sso_account) => chain.or_else(
                 "sso",
                 SsoCredentialsProvider::builder()
@@ -334,7 +318,7 @@ fn init_remote(
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
     let bucket_region = Region::new(bucket_config.region);
     let delimiter = "/".to_string();
-    let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
+    let s3_client = Arc::new(init_s3_client(bucket_region));
 
     let s3_root = match node_kind {
         NodeKind::Pageserver => RootTarget::Pageserver(S3Target {

From 4b8809b280b04c92d0c9e2cfb21cbc230de7995b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 16 May 2024 22:25:19 +0200
Subject: [PATCH 069/130] Tiered compaction: improvements to the windows
 (#7787)

Tiered compaction employs two sliding windows over the keyspace:
`KeyspaceWindow` for the image layer generation and `Window` for the
delta layer generation. Do some fixes to both windows:

* The distinction between the two windows is not very clear. Do the
absolute minimum to mention where they are used in the rustdoc
description of the struct. Maybe we should rename them (say
`WindowForImage` and `WindowForDelta`) or merge them into one window
implementation.
* Require the keys to strictly increase. The `accum_key_values` already
combines the key, so there is no logic needed in `Window::feed` for the
same key repeating. This is a follow-up to address the request in
https://github.com/neondatabase/neon/pull/7671#pullrequestreview-2051995541
* In `choose_next_delta`, we claimed in the comment to use 1.25 as the
factor but it was 1.66 instead. Fix this discrepancy by using `*5/4` as
the two operations.
---
 pageserver/compaction/src/compact_tiered.rs | 24 +++++++++++----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index 33c9948f45..20f88868f9 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -663,8 +663,8 @@ where
     }
 }
 
-// Sliding window through keyspace and values
-// This is used by over_with_images to decide on good split points
+/// Sliding window through keyspace and values for image layer
+/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
 struct KeyspaceWindow<K> {
     head: KeyspaceWindowHead<K>,
 
@@ -804,9 +804,9 @@ struct WindowElement<K> {
     accum_size: u64,
 }
 
-// Sliding window through keyspace and values
-//
-// This is used to decide what layer to write next, from the beginning of the window.
+/// Sliding window through keyspace and values for delta layer tiling
+///
+/// This is used to decide which delta layer to write next.
 struct Window<K> {
     elems: VecDeque<WindowElement<K>>,
 
@@ -830,11 +830,13 @@ where
     fn feed(&mut self, key: K, size: u64) {
         let last_size;
         if let Some(last) = self.elems.back_mut() {
-            assert!(last.last_key <= key);
-            if key == last.last_key {
-                last.accum_size += size;
-                return;
-            }
+            // We require the keys to be strictly increasing for the window.
+            // Keys should already have been deduplicated by `accum_key_values`
+            assert!(
+                last.last_key < key,
+                "last_key(={}) >= key(={key})",
+                last.last_key
+            );
             last_size = last.accum_size;
         } else {
             last_size = 0;
@@ -922,7 +924,7 @@ where
         // If we're willing to stretch it up to 1.25 target size, could we
         // gobble up the rest of the work? This avoids creating very small
         // "tail" layers at the end of the keyspace
-        if !has_more && self.remain_size() < target_size * 5 / 3 {
+        if !has_more && self.remain_size() < target_size * 5 / 4 {
             self.commit_upto(self.elems.len());
         } else {
             let delta_split_at = self.find_size_split(target_size);

From 6d951e69d636dc1fbfda2bc0282547fcae19ec81 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 17 May 2024 12:24:02 +0200
Subject: [PATCH 070/130] test_suite: patch, don't replace, the `tenant_config`
 field, where appropriate (#7771)

Before this PR, the changed tests would overwrite the entire
`tenant_config` because `pageserver_config_override` is merged
non-recursively into the `ps_cfg`.

This meant they would override the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our
matrix build for `compaction_algorithm=Tiered|Legacy` in
https://github.com/neondatabase/neon/pull/7748.

I found the tests fixed in this PR using the
`NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that
I added in #7748. Therefore, I think this is an exhaustive fix. This is
better than just searching the code base for `tenant_config`, which is
what I had sketched in #7747.

refs #7749
---
 test_runner/fixtures/neon_fixtures.py       | 14 +++++++----
 test_runner/regress/test_branch_behind.py   |  3 +--
 test_runner/regress/test_gc_aggressive.py   |  9 +++----
 test_runner/regress/test_old_request_lsn.py |  3 +--
 test_runner/regress/test_pageserver_api.py  |  2 ++
 test_runner/regress/test_pitr_gc.py         |  6 ++---
 test_runner/regress/test_recovery.py        |  8 +++---
 test_runner/regress/test_tenant_conf.py     | 27 +++++++++++++--------
 test_runner/regress/test_tenant_size.py     | 11 ++++++---
 test_runner/regress/test_timeline_size.py   | 20 +++++++++------
 test_runner/regress/test_wal_receiver.py    | 13 +++++++---
 11 files changed, 70 insertions(+), 46 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b4761f103b..23f30804b4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -455,7 +455,7 @@ class NeonEnvBuilder:
         test_overlay_dir: Optional[Path] = None,
         pageserver_remote_storage: Optional[RemoteStorage] = None,
         # toml that will be decomposed into `--config-override` flags during `pageserver --init`
-        pageserver_config_override: Optional[str] = None,
+        pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
         num_safekeepers: int = 1,
         num_pageservers: int = 1,
         # Use non-standard SK ids to check for various parsing bugs
@@ -1127,10 +1127,14 @@ class NeonEnv:
                 )
 
             if config.pageserver_config_override is not None:
-                for o in config.pageserver_config_override.split(";"):
-                    override = toml.loads(o)
-                    for key, value in override.items():
-                        ps_cfg[key] = value
+                if callable(config.pageserver_config_override):
+                    config.pageserver_config_override(ps_cfg)
+                else:
+                    assert isinstance(config.pageserver_config_override, str)
+                    for o in config.pageserver_config_override.split(";"):
+                        override = toml.loads(o)
+                        for key, value in override.items():
+                            ps_cfg[key] = value
 
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index ac2fc79be4..0a5336f5a2 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
 
     error_regexes = [
         ".*invalid branch start lsn.*",
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index e5067bba8b..44133f2350 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
 #
 def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
     endpoint = env.endpoints.create_start("test_gc_aggressive")
 
@@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
 
 #
 def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
-    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     num_index_uploads = 0
 
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
-    env = neon_env_builder.init_start()
+    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     tenant_id = env.initial_tenant
     timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
     endpoint = env.endpoints.create_start("test_gc_index_upload")
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 43b0bb56f0..f1dd3fb67d 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     env.neon_cli.create_branch("test_old_request_lsn", "main")
     endpoint = env.endpoints.create_start("test_old_request_lsn")
 
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 80a1d72f4a..abbea59113 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -42,6 +42,8 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
         "listen_http_addr",
         "pg_auth_type",
         "http_auth_type",
+        # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
+        # "tenant_config",
     ]
     required_config_overrides = [
         f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index 6434f431a4..7e676b5515 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
     # Set pitr interval such that we need to keep the data
-    neon_env_builder.pageserver_config_override = (
-        "tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"}
     )
-
-    env = neon_env_builder.init_start()
     endpoint_main = env.endpoints.create_start("main")
 
     main_pg_conn = endpoint_main.connect()
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index ab5c8be256..e21f9bb6f6 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 #
 def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Override default checkpointer settings to run it more often
-    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "1048576",
+        }
+    )
     env.pageserver.is_testing_enabled_or_skip()
 
     # We expect the pageserver to exit, which will cause storage storage controller
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index a345464208..2cbb036c0d 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -1,5 +1,6 @@
 import json
 from contextlib import closing
+from typing import Any, Dict
 
 import psycopg2.extras
 from fixtures.common_types import Lsn
@@ -14,16 +15,22 @@ from fixtures.utils import wait_until
 
 def test_tenant_config(neon_env_builder: NeonEnvBuilder):
     """Test per tenant configuration"""
-    # set some non-default global config
-    neon_env_builder.pageserver_config_override = """
-page_cache_size=444;
-wait_lsn_timeout='111 s';
-[tenant_config]
-checkpoint_distance = 10000
-compaction_target_size = 1048576
-evictions_low_residence_duration_metric_threshold = "2 days"
-eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
-"""
+
+    def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]):
+        ps_cfg["page_cache_size"] = 444
+        ps_cfg["wait_lsn_timeout"] = "111 s"
+
+        tenant_config = ps_cfg.setdefault("tenant_config", {})
+        tenant_config["checkpoint_distance"] = 10000
+        tenant_config["compaction_target_size"] = 1048576
+        tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
+        tenant_config["eviction_policy"] = {
+            "kind": "LayerAccessThreshold",
+            "period": "20s",
+            "threshold": "23 hours",
+        }
+
+    neon_env_builder.pageserver_config_override = set_some_nondefault_global_config
 
     env = neon_env_builder.init_start()
     # we configure eviction but no remote storage, there might be error lines
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 7894f6933d..d3a228dbeb 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(
 
     gc_horizon = 128 * 1024
 
-    neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "pitr_interval": "0sec",
+            "gc_horizon": gc_horizon,
+        }
+    )
 
     # FIXME: we have a race condition between GC and delete timeline. GC might fail with this
     # error. Similar to https://github.com/neondatabase/neon/issues/2671
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 18063bf104..a6d06df3b6 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
 
     # Disable background compaction as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
-    neon_env_builder.pageserver_config_override = (
-        "tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "100000",
+            "compaction_period": "10m",
+        }
     )
-
-    env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
@@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
 
     # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
-    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "100000",
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "pitr_interval": "1s",
+        }
+    )
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index d9265dcbcd..6582b34218 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -1,4 +1,5 @@
 import time
+from typing import Any, Dict
 
 from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
@@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
 # Kills one of the safekeepers and ensures that only the active ones are printed in the state.
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
     # Trigger WAL wait timeout faster
-    neon_env_builder.pageserver_config_override = """
-        wait_lsn_timeout = "1s"
-        tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
-    """
+    def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
+        ps_cfg["wait_lsn_timeout"] = "1s"
+        tenant_config = ps_cfg.setdefault("tenant_config", {})
+        tenant_config["walreceiver_connect_timeout"] = "2s"
+        tenant_config["lagging_wal_timeout"] = "2s"
+
+    neon_env_builder.pageserver_config_override = customize_pageserver_toml
+
     # Have notable SK ids to ensure we check logs for their presence, not some other random numbers
     neon_env_builder.safekeepers_id_start = 12345
     neon_env_builder.num_safekeepers = 3

From c1390bfc3bbd3a7f00df334a39220ca312fc888e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 17 May 2024 13:25:01 +0300
Subject: [PATCH 071/130] chore: update defaults for timeline_detach_ancestor
 (#7779)

by having 100 copy operations in flight twe climb up to 2500 requests
per min or 41/s. This is still probably less than is allowed, but fast
enough for our purposes.
---
 pageserver/src/tenant/timeline/detach_ancestor.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 7f59758c87..4d8e570181 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -56,7 +56,7 @@ impl Default for Options {
     fn default() -> Self {
         Self {
             rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
-            copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
+            copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
         }
     }
 }

From a8e6d259cb49d1bf156dfc2215b92c04d1e8a08f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 17 May 2024 13:24:03 +0100
Subject: [PATCH 072/130] pageserver: fixes for layer path changes (#7786)

## Problem

- When a layer with legacy local path format is evicted and then
re-downloaded, a panic happened because the path downloaded by remote
storage didn't match the path stored in Layer.
- While investigating, I also realized that secondary locations would
have a similar issue with evictions.

Closes: #7783

## Summary of changes

- Make remote timeline client take local paths as an input: it should
not have its own ideas about local paths, instead it just uses the layer
path that the Layer has.
- Make secondary state store an explicit local path, populated on scan
of local disk at startup. This provides the same behavior as for Layer,
that our local_layer_path is a _default_, but the layer path can
actually be anything (e.g. an old style one).
- Add tests for both cases.
---
 pageserver/src/disk_usage_eviction_task.rs    |  8 +-
 .../src/tenant/remote_timeline_client.rs      |  2 +
 .../tenant/remote_timeline_client/download.rs | 11 +--
 pageserver/src/tenant/secondary.rs            | 54 +++---------
 pageserver/src/tenant/secondary/downloader.rs | 47 ++++++++---
 pageserver/src/tenant/storage_layer/layer.rs  |  1 +
 .../regress/test_pageserver_generations.py    | 84 +++++++++++++++----
 7 files changed, 119 insertions(+), 88 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index ebeb8bbb20..7f25e49570 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                 }
                 EvictionLayer::Secondary(layer) => {
                     let file_size = layer.metadata.file_size();
-                    let tenant_manager = tenant_manager.clone();
 
                     js.spawn(async move {
                         layer
                             .secondary_tenant
-                            .evict_layer(
-                                tenant_manager.get_conf(),
-                                layer.timeline_id,
-                                layer.name,
-                                layer.metadata,
-                            )
+                            .evict_layer(layer.timeline_id, layer.name)
                             .await;
                         Ok(file_size)
                     });
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index c5462dac43..07d6af696c 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -518,6 +518,7 @@ impl RemoteTimelineClient {
         &self,
         layer_file_name: &LayerName,
         layer_metadata: &LayerFileMetadata,
+        local_path: &Utf8Path,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<u64> {
@@ -536,6 +537,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 layer_file_name,
                 layer_metadata,
+                local_path,
                 cancel,
                 ctx,
             )
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index f3c9e64533..70c5cae05e 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
-use crate::tenant::storage_layer::layer::local_layer_path;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>(
     timeline_id: TimelineId,
     layer_file_name: &'a LayerName,
     layer_metadata: &'a LayerFileMetadata,
+    local_path: &Utf8Path,
     cancel: &CancellationToken,
     ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
     let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
-    let local_path = local_layer_path(
-        conf,
-        &tenant_shard_id,
-        &timeline_id,
-        layer_file_name,
-        &layer_metadata.generation,
-    );
 
     let remote_path = remote_layer_path(
         &tenant_shard_id.tenant_id,
@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
     // For more context about durable_rename check this email from postgres mailing list:
     // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
+    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
 
     let bytes_amount = download_retry(
         || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 7075044baf..252b6eb11b 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,11 +6,9 @@ mod scheduler;
 use std::{sync::Arc, time::SystemTime};
 
 use crate::{
-    config::PageServerConf,
     context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    virtual_file::MaybeFatalIo,
 };
 
 use self::{
@@ -21,9 +19,8 @@ use self::{
 use super::{
     config::{SecondaryLocationConfig, TenantConfOpt},
     mgr::TenantManager,
-    remote_timeline_client::LayerFileMetadata,
     span::debug_assert_current_span_has_tenant_id,
-    storage_layer::{layer::local_layer_path, LayerName},
+    storage_layer::LayerName,
 };
 
 use pageserver_api::{
@@ -178,13 +175,7 @@ impl SecondaryTenant {
 
     /// Cancellation safe, but on cancellation the eviction will go through
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
-    pub(crate) async fn evict_layer(
-        self: &Arc<Self>,
-        conf: &PageServerConf,
-        timeline_id: TimelineId,
-        name: LayerName,
-        metadata: LayerFileMetadata,
-    ) {
+    pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
         debug_assert_current_span_has_tenant_id();
 
         let guard = match self.gate.enter() {
@@ -197,41 +188,11 @@ impl SecondaryTenant {
 
         let now = SystemTime::now();
 
-        let local_path = local_layer_path(
-            conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &name,
-            &metadata.generation,
-        );
-
         let this = self.clone();
 
         // spawn it to be cancellation safe
         tokio::task::spawn_blocking(move || {
             let _guard = guard;
-            // We tolerate ENOENT, because between planning eviction and executing
-            // it, the secondary downloader could have seen an updated heatmap that
-            // resulted in a layer being deleted.
-            // Other local I/O errors are process-fatal: these should never happen.
-            let deleted = std::fs::remove_file(local_path);
-
-            let not_found = deleted
-                .as_ref()
-                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
-
-            let deleted = if not_found {
-                false
-            } else {
-                deleted
-                    .map(|()| true)
-                    .fatal_err("Deleting layer during eviction")
-            };
-
-            if !deleted {
-                // skip updating accounting and putting perhaps later timestamp
-                return;
-            }
 
             // Update the timeline's state.  This does not have to be synchronized with
             // the download process, because:
@@ -250,8 +211,15 @@ impl SecondaryTenant {
             // of the cache.
             let mut detail = this.detail.lock().unwrap();
             if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-                timeline_detail.on_disk_layers.remove(&name);
-                timeline_detail.evicted_at.insert(name, now);
+                let removed = timeline_detail.on_disk_layers.remove(&name);
+
+                // We might race with removal of the same layer during downloads, if it was removed
+                // from the heatmap.  If we see that the OnDiskState is gone, then no need to
+                // do a physical deletion or store in evicted_at.
+                if let Some(removed) = removed {
+                    removed.remove_blocking();
+                    timeline_detail.evicted_at.insert(name, now);
+                }
             }
         })
         .await
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 46a3d7e81f..8f27220771 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -111,6 +111,7 @@ struct SecondaryDownloader {
 pub(super) struct OnDiskState {
     metadata: LayerFileMetadata,
     access_time: SystemTime,
+    local_path: Utf8PathBuf,
 }
 
 impl OnDiskState {
@@ -121,12 +122,26 @@ impl OnDiskState {
         _ame: LayerName,
         metadata: LayerFileMetadata,
         access_time: SystemTime,
+        local_path: Utf8PathBuf,
     ) -> Self {
         Self {
             metadata,
             access_time,
+            local_path,
         }
     }
+
+    // This is infallible, because all errors are either acceptable (ENOENT), or totally
+    // unexpected (fatal).
+    pub(super) fn remove_blocking(&self) {
+        // We tolerate ENOENT, because between planning eviction and executing
+        // it, the secondary downloader could have seen an updated heatmap that
+        // resulted in a layer being deleted.
+        // Other local I/O errors are process-fatal: these should never happen.
+        std::fs::remove_file(&self.local_path)
+            .or_else(fs_ext::ignore_not_found)
+            .fatal_err("Deleting secondary layer")
+    }
 }
 
 #[derive(Debug, Clone, Default)]
@@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> {
                 if cfg!(debug_assertions) {
                     // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
                     // are already present on disk are really there.
-                    let local_path = local_layer_path(
-                        self.conf,
-                        tenant_shard_id,
-                        &timeline.timeline_id,
-                        &layer.name,
-                        &layer.metadata.generation,
-                    );
-
-                    match tokio::fs::metadata(&local_path).await {
+                    match tokio::fs::metadata(&on_disk.local_path).await {
                         Ok(meta) => {
                             tracing::debug!(
                                 "Layer {} present at {}, size {}",
                                 layer.name,
-                                local_path,
+                                on_disk.local_path,
                                 meta.len(),
                             );
                         }
@@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> {
                             tracing::warn!(
                                 "Layer {} not found at {} ({})",
                                 layer.name,
-                                local_path,
+                                on_disk.local_path,
                                 e
                             );
                             debug_assert!(false);
@@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> {
                         v.get_mut().access_time = t.access_time;
                     }
                     Entry::Vacant(e) => {
+                        let local_path = local_layer_path(
+                            self.conf,
+                            tenant_shard_id,
+                            &timeline.timeline_id,
+                            &t.name,
+                            &t.metadata.generation,
+                        );
                         e.insert(OnDiskState::new(
                             self.conf,
                             tenant_shard_id,
@@ -933,6 +947,7 @@ impl<'a> TenantDownloader<'a> {
                             t.name,
                             LayerFileMetadata::from(&t.metadata),
                             t.access_time,
+                            local_path,
                         ));
                     }
                 }
@@ -955,6 +970,14 @@ impl<'a> TenantDownloader<'a> {
             &self.secondary_state.cancel
         );
 
+        let local_path = local_layer_path(
+            self.conf,
+            tenant_shard_id,
+            timeline_id,
+            &layer.name,
+            &layer.metadata.generation,
+        );
+
         // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
         let downloaded_bytes = match download_layer_file(
             self.conf,
@@ -963,6 +986,7 @@ impl<'a> TenantDownloader<'a> {
             *timeline_id,
             &layer.name,
             &LayerFileMetadata::from(&layer.metadata),
+            &local_path,
             &self.secondary_state.cancel,
             ctx,
         )
@@ -1116,6 +1140,7 @@ async fn init_timeline_state(
                                     name,
                                     LayerFileMetadata::from(&remote_meta.metadata),
                                     remote_meta.access_time,
+                                    file_path,
                                 ),
                             );
                         }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index e8c712c4c6..97b349f635 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1108,6 +1108,7 @@ impl LayerInner {
             .download_layer_file(
                 &self.desc.layer_name(),
                 &self.metadata(),
+                &self.path,
                 &timeline.cancel,
                 ctx,
             )
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 9b97254410..0235cf6d20 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -25,6 +25,7 @@ from fixtures.neon_fixtures import (
     S3Scrubber,
     generate_uploads_and_deletions,
 )
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
@@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths(
     generation numbers: it should accept these layer files, and avoid doing
     a delete/download cycle on them.
     """
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )
 
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
     workload.write_rows(1000)
 
-    env.pageserver.stop()
+    attached_pageserver = env.get_tenant_pageserver(tenant_id)
+    secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
+        0
+    ]
+
+    attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
 
     # Rename the local paths to legacy format, to simulate what
-    # we would see when upgrading
-    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    files_renamed = 0
-    for filename in os.listdir(timeline_dir):
-        path = os.path.join(timeline_dir, filename)
-        log.info(f"Found file {path}")
-        if path.endswith("-v1-00000001"):
-            new_path = path[:-12]
-            os.rename(path, new_path)
-            log.info(f"Renamed {path} -> {new_path}")
-            files_renamed += 1
+    # we would see when upgrading.  Do this on both attached and secondary locations, as we will
+    # test the behavior of both.
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        files_renamed = 0
+        for filename in os.listdir(timeline_dir):
+            path = os.path.join(timeline_dir, filename)
+            log.info(f"Found file {path}")
+            if path.endswith("-v1-00000001"):
+                new_path = path[:-12]
+                os.rename(path, new_path)
+                log.info(f"Renamed {path} -> {new_path}")
+                files_renamed += 1
 
-    assert files_renamed > 0
+        assert files_renamed > 0
 
-    env.pageserver.start()
+        pageserver.start()
 
     workload.validate()
 
     # Assert that there were no on-demand downloads
     assert (
-        env.pageserver.http_client().get_metric_value(
+        attached_pageserver.http_client().get_metric_value(
             "pageserver_remote_ondemand_downloaded_layers_total"
         )
         == 0
     )
+
+    # Do a secondary download and ensure there were no layer downloads
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+    assert (
+        secondary_pageserver.http_client().get_metric_value(
+            "pageserver_secondary_download_layer_total"
+        )
+        == 0
+    )
+
+    # Check that when we evict and promote one of the legacy-named layers, everything works as
+    # expected
+    local_layers = list(
+        (
+            parse_layer_file_name(path.name),
+            os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
+        )
+        for path in attached_pageserver.list_layers(tenant_id, timeline_id)
+    )
+    (victim_layer_name, victim_path) = local_layers[0]
+    assert os.path.exists(victim_path)
+
+    attached_pageserver.http_client().evict_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    assert not os.path.exists(victim_path)
+
+    attached_pageserver.http_client().download_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    # We should download into the same local path we started with
+    assert os.path.exists(victim_path)

From af99c959ef460326b35716239c09b3a572c43b4c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 17 May 2024 16:44:33 +0100
Subject: [PATCH 073/130] storage controller: use SERIALIZABLE isolation level
 (#7792)

## Problem

The storage controller generally assumes that things like updating
generation numbers are atomic: it should use a strict isolation level.

## Summary of changes

- Wrap all database operations in a SERIALIZABLE transaction.
- Retry serialization failures, as these do not indicate problems and
are normal when plenty of concurrent work is happening.

Using this isolation level for all reads is overkill, but much simpler
than reasoning about it on a per-operation basis, and does not hurt
performance.

Tested this with a modified version of storage_controller_many_tenants
test with 128k shards, to check that our performance is still fine: it
is.
---
 storage_controller/src/persistence.rs | 230 ++++++++++++++------------
 1 file changed, 126 insertions(+), 104 deletions(-)

diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index dca37166ba..67c05296d5 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -173,7 +173,7 @@ impl Persistence {
     /// Wraps `with_conn` in order to collect latency and error metrics
     async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
     where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
         let latency = &METRICS_REGISTRY
@@ -199,13 +199,48 @@ impl Persistence {
     /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
     async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
     where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
+        // A generous allowance for how many times we may retry serializable transactions
+        // before giving up.  This is not expected to be hit: it is a defensive measure in case we
+        // somehow engineer a situation where duelling transactions might otherwise live-lock.
+        const MAX_RETRIES: usize = 128;
+
         let mut conn = self.connection_pool.get()?;
-        tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
-            .await
-            .expect("Task panic")
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            let mut retry_count = 0;
+            loop {
+                match conn.build_transaction().serializable().run(|c| func(c)) {
+                    Ok(r) => break Ok(r),
+                    Err(
+                        err @ DatabaseError::Query(diesel::result::Error::DatabaseError(
+                            diesel::result::DatabaseErrorKind::SerializationFailure,
+                            _,
+                        )),
+                    ) => {
+                        retry_count += 1;
+                        if retry_count > MAX_RETRIES {
+                            tracing::error!(
+                                "Exceeded max retries on SerializationFailure errors: {err:?}"
+                            );
+                            break Err(err);
+                        } else {
+                            // Retry on serialization errors: these are expected, because even though our
+                            // transactions don't fight for the same rows, they will occasionally collide
+                            // on index pages (e.g. increment_generation for unrelated shards can collide)
+                            tracing::debug!(
+                                "Retrying transaction on serialization failure {err:?}"
+                            );
+                            continue;
+                        }
+                    }
+                    Err(e) => break Err(e),
+                }
+            }
+        })
+        .await
+        .expect("Task panic")
     }
 
     /// When a node is first registered, persist it before using it for anything
@@ -358,14 +393,11 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::InsertTenantShards,
             move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    for tenant in &shards {
-                        diesel::insert_into(tenant_shards)
-                            .values(tenant)
-                            .execute(conn)?;
-                    }
-                    Ok(())
-                })?;
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                 Ok(())
             },
         )
@@ -533,8 +565,11 @@ impl Persistence {
             let update = ShardUpdate {
                 generation: input_generation.map(|g| g.into().unwrap() as i32),
                 placement_policy: input_placement_policy
+                    .as_ref()
                     .map(|p| serde_json::to_string(&p).unwrap()),
-                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
+                config: input_config
+                    .as_ref()
+                    .map(|c| serde_json::to_string(&c).unwrap()),
                 scheduling_policy: input_scheduling_policy
                     .map(|p| serde_json::to_string(&p).unwrap()),
             };
@@ -581,55 +616,51 @@ impl Persistence {
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
         self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> DatabaseResult<()> {
-                // Mark parent shards as splitting
+            // Mark parent shards as splitting
 
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.literal() as i32))
-                    .set((splitting.eq(1),))
-                    .execute(conn)?;
-                if u8::try_from(updated)
-                    .map_err(|_| DatabaseError::Logical(
-                        format!("Overflow existing shard count {} while splitting", updated))
-                    )? != old_shard_count.count() {
-                    // Perhaps a deletion or another split raced with this attempt to split, mutating
-                    // the parent shards that we intend to split. In this case the split request should fail.
-                    return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
-                    ));
+            let updated = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(split_tenant_id.to_string()))
+                .filter(shard_count.eq(old_shard_count.literal() as i32))
+                .set((splitting.eq(1),))
+                .execute(conn)?;
+            if u8::try_from(updated)
+                .map_err(|_| DatabaseError::Logical(
+                    format!("Overflow existing shard count {} while splitting", updated))
+                )? != old_shard_count.count() {
+                // Perhaps a deletion or another split raced with this attempt to split, mutating
+                // the parent shards that we intend to split. In this case the split request should fail.
+                return Err(DatabaseError::Logical(
+                    format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
+                ));
+            }
+
+            // FIXME: spurious clone to sidestep closure move rules
+            let parent_to_children = parent_to_children.clone();
+
+            // Insert child shards
+            for (parent_shard_id, children) in parent_to_children {
+                let mut parent = crate::schema::tenant_shards::table
+                    .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
+                    .load::<TenantShardPersistence>(conn)?;
+                let parent = if parent.len() != 1 {
+                    return Err(DatabaseError::Logical(format!(
+                        "Parent shard {parent_shard_id} not found"
+                    )));
+                } else {
+                    parent.pop().unwrap()
+                };
+                for mut shard in children {
+                    // Carry the parent's generation into the child
+                    shard.generation = parent.generation;
+
+                    debug_assert!(shard.splitting == SplitState::Splitting);
+                    diesel::insert_into(tenant_shards)
+                        .values(shard)
+                        .execute(conn)?;
                 }
-
-                // FIXME: spurious clone to sidestep closure move rules
-                let parent_to_children = parent_to_children.clone();
-
-                // Insert child shards
-                for (parent_shard_id, children) in parent_to_children {
-                    let mut parent = crate::schema::tenant_shards::table
-                        .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
-                        .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
-                        .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
-                        .load::<TenantShardPersistence>(conn)?;
-                    let parent = if parent.len() != 1 {
-                        return Err(DatabaseError::Logical(format!(
-                            "Parent shard {parent_shard_id} not found"
-                        )));
-                    } else {
-                        parent.pop().unwrap()
-                    };
-                    for mut shard in children {
-                        // Carry the parent's generation into the child
-                        shard.generation = parent.generation;
-
-                        debug_assert!(shard.splitting == SplitState::Splitting);
-                        diesel::insert_into(tenant_shards)
-                            .values(shard)
-                            .execute(conn)?;
-                    }
-                }
-
-                Ok(())
-            })?;
+            }
 
             Ok(())
         })
@@ -647,22 +678,18 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::CompleteShardSplit,
             move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    // Drop parent shards
-                    diesel::delete(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .filter(shard_count.eq(old_shard_count.literal() as i32))
-                        .execute(conn)?;
+                // Drop parent shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
+                    .execute(conn)?;
 
-                    // Clear sharding flag
-                    let updated = diesel::update(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .set((splitting.eq(0),))
-                        .execute(conn)?;
-                    debug_assert!(updated > 0);
-
-                    Ok(())
-                })?;
+                // Clear sharding flag
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+                debug_assert!(updated > 0);
 
                 Ok(())
             },
@@ -681,39 +708,34 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::AbortShardSplit,
             move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-                let aborted =
-                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                        // Clear the splitting state on parent shards
-                        let updated = diesel::update(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.ne(new_shard_count.literal() as i32))
-                            .set((splitting.eq(0),))
-                            .execute(conn)?;
+                // Clear the splitting state on parent shards
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
 
-                        // Parent shards are already gone: we cannot abort.
-                        if updated == 0 {
-                            return Ok(AbortShardSplitStatus::Complete);
-                        }
+                // Parent shards are already gone: we cannot abort.
+                if updated == 0 {
+                    return Ok(AbortShardSplitStatus::Complete);
+                }
 
-                        // Sanity check: if parent shards were present, their cardinality should
-                        // be less than the number of child shards.
-                        if updated >= new_shard_count.count() as usize {
-                            return Err(DatabaseError::Logical(format!(
-                                "Unexpected parent shard count {updated} while aborting split to \
+                // Sanity check: if parent shards were present, their cardinality should
+                // be less than the number of child shards.
+                if updated >= new_shard_count.count() as usize {
+                    return Err(DatabaseError::Logical(format!(
+                        "Unexpected parent shard count {updated} while aborting split to \
                             count {new_shard_count:?} on tenant {split_tenant_id}"
-                            )));
-                        }
+                    )));
+                }
 
-                        // Erase child shards
-                        diesel::delete(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.eq(new_shard_count.literal() as i32))
-                            .execute(conn)?;
+                // Erase child shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                    .execute(conn)?;
 
-                        Ok(AbortShardSplitStatus::Aborted)
-                    })?;
-
-                Ok(aborted)
+                Ok(AbortShardSplitStatus::Aborted)
             },
         )
         .await

From c84656a53e92ca4628ffa8061e34102263576f43 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 17 May 2024 17:01:24 +0100
Subject: [PATCH 074/130] pageserver: implement auto-splitting (#7681)

## Problem

Currently tenants are only split into multiple shards if a human being
calls the API to do it.

Issue: #7388

## Summary of changes

- Add a pageserver API for returning the top tenants by size
- Add a step to the controller's background loop where if there is no
reconciliation or optimization to be done, it looks for things to split.
- Add a test that runs pgbench on many tenants concurrently, and checks
that splitting happens as expected as tenants grow, without interrupting
the client I/O.

This PR is quite basic: there is a tasklist in
https://github.com/neondatabase/neon/issues/7388 for further work. This
PR is meant to be safe (off by default), and sufficient to enable our
staging environment to run lots of sharded tenants without a human
having to set them up.
---
 control_plane/src/local_env.rs                |   4 +
 control_plane/src/storage_controller.rs       |   4 +
 libs/pageserver_api/src/models.rs             |  49 +++
 libs/pageserver_api/src/shard.rs              |   2 +-
 pageserver/client/src/mgmt_api.rs             |  12 +
 pageserver/src/http/routes.rs                 |  98 ++++++
 pageserver/src/metrics.rs                     |  64 ++--
 pageserver/src/tenant.rs                      |  26 ++
 .../src/tenant/remote_timeline_client.rs      |   6 +-
 storage_controller/src/main.rs                |   5 +
 storage_controller/src/pageserver_client.rs   |  14 +-
 storage_controller/src/service.rs             | 114 ++++++-
 test_runner/fixtures/pageserver/http.py       |  15 +
 .../performance/test_sharding_autosplit.py    | 280 ++++++++++++++++++
 test_runner/regress/test_sharding.py          |  42 +++
 15 files changed, 689 insertions(+), 46 deletions(-)
 create mode 100644 test_runner/performance/test_sharding_autosplit.py

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index d13884198e..0edcf1be4e 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -152,6 +152,9 @@ pub struct NeonStorageControllerConf {
     /// Heartbeat timeout before marking a node offline
     #[serde(with = "humantime_serde")]
     pub max_unavailable: Duration,
+
+    /// Threshold for auto-splitting a tenant into shards
+    pub split_threshold: Option<u64>,
 }
 
 impl NeonStorageControllerConf {
@@ -164,6 +167,7 @@ impl Default for NeonStorageControllerConf {
     fn default() -> Self {
         Self {
             max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
+            split_threshold: None,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f1c43f4036..96e8276f4d 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -305,6 +305,10 @@ impl StorageController {
             ));
         }
 
+        if let Some(split_threshold) = self.config.split_threshold.as_ref() {
+            args.push(format!("--split-threshold={split_threshold}"))
+        }
+
         background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 7cf54bf32a..d52fb5e93d 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -824,6 +824,55 @@ pub struct TenantScanRemoteStorageResponse {
     pub shards: Vec<TenantScanRemoteStorageShard>,
 }
 
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum TenantSorting {
+    ResidentSize,
+    MaxLogicalSize,
+}
+
+impl Default for TenantSorting {
+    fn default() -> Self {
+        Self::ResidentSize
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct TopTenantShardsRequest {
+    // How would you like to sort the tenants?
+    pub order_by: TenantSorting,
+
+    // How many results?
+    pub limit: usize,
+
+    // Omit tenants with more than this many shards (e.g. if this is the max number of shards
+    // that the caller would ever split to)
+    pub where_shards_lt: Option<ShardCount>,
+
+    // Omit tenants where the ordering metric is less than this (this is an optimization to
+    // let us quickly exclude numerous tiny shards)
+    pub where_gt: Option<u64>,
+}
+
+#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
+pub struct TopTenantShardItem {
+    pub id: TenantShardId,
+
+    /// Total size of layers on local disk for all timelines in this tenant
+    pub resident_size: u64,
+
+    /// Total size of layers in remote storage for all timelines in this tenant
+    pub physical_size: u64,
+
+    /// The largest logical size of a timeline within this tenant
+    pub max_logical_size: u64,
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub struct TopTenantShardsResponse {
+    pub shards: Vec<TopTenantShardItem>,
+}
+
 pub mod virtual_file {
     #[derive(
         Copy,
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index ff6d3d91b6..43d9b2e48c 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -125,7 +125,7 @@ impl ShardCount {
 
     /// `v` may be zero, or the number of shards in the tenant.  `v` is what
     /// [`Self::literal`] would return.
-    pub fn new(val: u8) -> Self {
+    pub const fn new(val: u8) -> Self {
         Self(val)
     }
 }
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 6df8b2170d..5904713da9 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -486,6 +486,18 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn top_tenant_shards(
+        &self,
+        request: TopTenantShardsRequest,
+    ) -> Result<TopTenantShardsResponse> {
+        let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
+        self.request(Method::POST, uri, request)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn layer_map_info(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0a98d32f02..b8d5c67ce0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,6 +1,8 @@
 //!
 //! Management HTTP API
 //!
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -24,7 +26,11 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
 use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
+use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TenantState;
+use pageserver_api::models::TopTenantShardItem;
+use pageserver_api::models::TopTenantShardsRequest;
+use pageserver_api::models::TopTenantShardsResponse;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
     TenantLoadRequest, TenantLocationConfigRequest,
@@ -2323,6 +2329,97 @@ async fn get_utilization(
         .map_err(ApiError::InternalServerError)
 }
 
+/// Report on the largest tenants on this pageserver, for the storage controller to identify
+/// candidates for splitting
+async fn post_top_tenants(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let request: TopTenantShardsRequest = json_request(&mut r).await?;
+    let state = get_state(&r);
+
+    fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
+        match order_by {
+            TenantSorting::ResidentSize => sizes.resident_size,
+            TenantSorting::MaxLogicalSize => sizes.max_logical_size,
+        }
+    }
+
+    #[derive(Eq, PartialEq)]
+    struct HeapItem {
+        metric: u64,
+        sizes: TopTenantShardItem,
+    }
+
+    impl PartialOrd for HeapItem {
+        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
+    /// supports popping the greatest item but not the smallest.
+    impl Ord for HeapItem {
+        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+            Reverse(self.metric).cmp(&Reverse(other.metric))
+        }
+    }
+
+    let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
+
+    // FIXME: this is a lot of clones to take this tenant list
+    for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
+        if let Some(shards_lt) = request.where_shards_lt {
+            // Ignore tenants which already have >= this many shards
+            if tenant_shard_id.shard_count >= shards_lt {
+                continue;
+            }
+        }
+
+        let sizes = match tenant_slot {
+            TenantSlot::Attached(tenant) => tenant.get_sizes(),
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        };
+        let metric = get_size_metric(&sizes, &request.order_by);
+
+        if let Some(gt) = request.where_gt {
+            // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
+            if metric <= gt {
+                continue;
+            }
+        };
+
+        match top_n.peek() {
+            None => {
+                // Top N list is empty: candidate becomes first member
+                top_n.push(HeapItem { metric, sizes });
+            }
+            Some(i) if i.metric > metric && top_n.len() < request.limit => {
+                // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
+                top_n.push(HeapItem { metric, sizes });
+            }
+            Some(i) if i.metric > metric => {
+                // List is at limit and lowest value is greater than our candidate, drop it.
+            }
+            Some(_) => top_n.push(HeapItem { metric, sizes }),
+        }
+
+        while top_n.len() > request.limit {
+            top_n.pop();
+        }
+    }
+
+    json_response(
+        StatusCode::OK,
+        TopTenantShardsResponse {
+            shards: top_n.into_iter().map(|i| i.sizes).collect(),
+        },
+    )
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2609,5 +2706,6 @@ pub fn make_router(
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
         .any(handler_404))
 }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ffcd08b4b3..5315f0b936 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2098,7 +2098,7 @@ pub(crate) struct TimelineMetrics {
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
-    resident_physical_size_gauge: UIntGauge,
+    pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub aux_file_size_gauge: IntGauge,
@@ -2312,6 +2312,7 @@ use pin_project_lite::pin_project;
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
+use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
@@ -2321,35 +2322,35 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 
 /// Maintain a per timeline gauge in addition to the global gauge.
-struct PerTimelineRemotePhysicalSizeGauge {
-    last_set: u64,
+pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: AtomicU64,
     gauge: UIntGauge,
 }
 
 impl PerTimelineRemotePhysicalSizeGauge {
     fn new(per_timeline_gauge: UIntGauge) -> Self {
         Self {
-            last_set: per_timeline_gauge.get(),
+            last_set: AtomicU64::new(0),
             gauge: per_timeline_gauge,
         }
     }
-    fn set(&mut self, sz: u64) {
+    pub(crate) fn set(&self, sz: u64) {
         self.gauge.set(sz);
-        if sz < self.last_set {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+        let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
+        if sz < prev {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
         } else {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
         };
-        self.last_set = sz;
     }
-    fn get(&self) -> u64 {
+    pub(crate) fn get(&self) -> u64 {
         self.gauge.get()
     }
 }
 
 impl Drop for PerTimelineRemotePhysicalSizeGauge {
     fn drop(&mut self) {
-        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
     }
 }
 
@@ -2357,7 +2358,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
     tenant_id: String,
     shard_id: String,
     timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
+    pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
     calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -2365,38 +2366,27 @@ pub(crate) struct RemoteTimelineClientMetrics {
 
 impl RemoteTimelineClientMetrics {
     pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
+        let tenant_id_str = tenant_shard_id.tenant_id.to_string();
+        let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
+        let timeline_id_str = timeline_id.to_string();
+
+        let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
+            REMOTE_PHYSICAL_SIZE
+                .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
+                .unwrap(),
+        );
+
         RemoteTimelineClientMetrics {
-            tenant_id: tenant_shard_id.tenant_id.to_string(),
-            shard_id: format!("{}", tenant_shard_id.shard_slug()),
-            timeline_id: timeline_id.to_string(),
+            tenant_id: tenant_id_str,
+            shard_id: shard_id_str,
+            timeline_id: timeline_id_str,
             calls: Mutex::new(HashMap::default()),
             bytes_started_counter: Mutex::new(HashMap::default()),
             bytes_finished_counter: Mutex::new(HashMap::default()),
-            remote_physical_size_gauge: Mutex::new(None),
+            remote_physical_size_gauge,
         }
     }
 
-    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
-        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        let gauge = guard.get_or_insert_with(|| {
-            PerTimelineRemotePhysicalSizeGauge::new(
-                REMOTE_PHYSICAL_SIZE
-                    .get_metric_with_label_values(&[
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
-                    ])
-                    .unwrap(),
-            )
-        });
-        gauge.set(sz);
-    }
-
-    pub(crate) fn remote_physical_size_get(&self) -> u64 {
-        let guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
-    }
-
     pub fn remote_operation_time(
         &self,
         file_kind: &RemoteOpFileKind,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 026cbc107c..54b63f7042 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,6 +21,7 @@ use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::TimelineState;
+use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::ShardStripeSize;
@@ -2196,6 +2197,31 @@ impl Tenant {
 
         Ok(())
     }
+
+    pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
+        let mut result = TopTenantShardItem {
+            id: self.tenant_shard_id,
+            resident_size: 0,
+            physical_size: 0,
+            max_logical_size: 0,
+        };
+
+        for timeline in self.timelines.lock().unwrap().values() {
+            result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
+
+            result.physical_size += timeline
+                .remote_client
+                .metrics
+                .remote_physical_size_gauge
+                .get();
+            result.max_logical_size = std::cmp::max(
+                result.max_logical_size,
+                timeline.metrics.current_logical_size_gauge.get(),
+            );
+        }
+
+        result
+    }
 }
 
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 07d6af696c..3a1113cf01 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -317,7 +317,7 @@ pub struct RemoteTimelineClient {
 
     upload_queue: Mutex<UploadQueue>,
 
-    metrics: Arc<RemoteTimelineClientMetrics>,
+    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
 
     storage_impl: GenericRemoteStorage,
 
@@ -461,11 +461,11 @@ impl RemoteTimelineClient {
         } else {
             0
         };
-        self.metrics.remote_physical_size_set(size);
+        self.metrics.remote_physical_size_gauge.set(size);
     }
 
     pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_get()
+        self.metrics.remote_physical_size_gauge.get()
     }
 
     //
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index f1454af533..ce8f8d0cdd 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -66,6 +66,10 @@ struct Cli {
     #[arg(long)]
     max_unavailable_interval: Option<humantime::Duration>,
 
+    /// Size threshold for automatically splitting shards (disabled by default)
+    #[arg(long)]
+    split_threshold: Option<u64>,
+
     /// Maximum number of reconcilers that may run in parallel
     #[arg(long)]
     reconciler_concurrency: Option<usize>,
@@ -255,6 +259,7 @@ async fn async_main() -> anyhow::Result<()> {
         reconciler_concurrency: args
             .reconciler_concurrency
             .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
+        split_threshold: args.split_threshold,
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 25b6b67e12..769aba80ca 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -2,7 +2,7 @@ use pageserver_api::{
     models::{
         LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
         TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
+        TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
     },
     shard::TenantShardId,
 };
@@ -234,4 +234,16 @@ impl PageserverClient {
             self.inner.get_utilization().await
         )
     }
+
+    pub(crate) async fn top_tenant_shards(
+        &self,
+        request: TopTenantShardsRequest,
+    ) -> Result<TopTenantShardsResponse> {
+        measured_request!(
+            "top_tenants",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.top_tenant_shards(request).await
+        )
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ae7e8d3d7d..f914f4e0bb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -32,10 +32,10 @@ use pageserver_api::{
         TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
         UtilizationScore,
     },
-    models::{SecondaryProgress, TenantConfigRequest},
+    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
 use reqwest::StatusCode;
-use tracing::instrument;
+use tracing::{instrument, Instrument};
 
 use crate::pageserver_client::PageserverClient;
 use pageserver_api::{
@@ -222,6 +222,10 @@ pub struct Config {
 
     /// How many Reconcilers may be spawned concurrently
     pub reconciler_concurrency: usize,
+
+    /// How large must a shard grow in bytes before we split it?
+    /// None disables auto-splitting.
+    pub split_threshold: Option<u64>,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -699,7 +703,7 @@ impl Service {
     /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
     /// for those retries.
     #[instrument(skip_all)]
-    async fn background_reconcile(&self) {
+    async fn background_reconcile(self: &Arc<Self>) {
         self.startup_complete.clone().wait().await;
 
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
@@ -711,7 +715,11 @@ impl Service {
                 let reconciles_spawned = self.reconcile_all();
                 if reconciles_spawned == 0 {
                     // Run optimizer only when we didn't find any other work to do
-                    self.optimize_all().await;
+                    let optimizations = self.optimize_all().await;
+                    if optimizations == 0 {
+                        // Run new splits only when no optimizations are pending
+                        self.autosplit_tenants().await;
+                    }
                 }
             }
               _ = self.cancel.cancelled() => return
@@ -4766,6 +4774,104 @@ impl Service {
         validated_work
     }
 
+    /// Look for shards which are oversized and in need of splitting
+    async fn autosplit_tenants(self: &Arc<Self>) {
+        let Some(split_threshold) = self.config.split_threshold else {
+            // Auto-splitting is disabled
+            return;
+        };
+
+        let nodes = self.inner.read().unwrap().nodes.clone();
+
+        const SPLIT_TO_MAX: ShardCount = ShardCount::new(8);
+
+        let mut top_n = Vec::new();
+
+        // Call into each node to look for big tenants
+        let top_n_request = TopTenantShardsRequest {
+            // We currently split based on logical size, for simplicity: logical size is a signal of
+            // the user's intent to run a large database, whereas physical/resident size can be symptoms
+            // of compaction issues.  Eventually we should switch to using resident size to bound the
+            // disk space impact of one shard.
+            order_by: models::TenantSorting::MaxLogicalSize,
+            limit: 10,
+            where_shards_lt: Some(SPLIT_TO_MAX),
+            where_gt: Some(split_threshold),
+        };
+        for node in nodes.values() {
+            let request_ref = &top_n_request;
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let request = request_ref.clone();
+                        client.top_tenant_shards(request.clone()).await
+                    },
+                    &self.config.jwt_token,
+                    3,
+                    3,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(node_top_n)) => {
+                    top_n.extend(node_top_n.shards.into_iter());
+                }
+                Some(Err(mgmt_api::Error::Cancelled)) => {
+                    continue;
+                }
+                Some(Err(e)) => {
+                    tracing::warn!("Failed to fetch top N tenants from {node}: {e}");
+                    continue;
+                }
+                None => {
+                    // Node is shutting down
+                    continue;
+                }
+            };
+        }
+
+        // Pick the biggest tenant to split first
+        top_n.sort_by_key(|i| i.resident_size);
+        let Some(split_candidate) = top_n.into_iter().next() else {
+            tracing::debug!("No split-elegible shards found");
+            return;
+        };
+
+        // We spawn a task to run this, so it's exactly like some external API client requesting it.  We don't
+        // want to block the background reconcile loop on this.
+        tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}");
+
+        let this = self.clone();
+        tokio::spawn(
+            async move {
+                match this
+                    .tenant_shard_split(
+                        split_candidate.id.tenant_id,
+                        TenantShardSplitRequest {
+                            // Always split to the max number of shards: this avoids stepping through
+                            // intervening shard counts and encountering the overrhead of a split+cleanup
+                            // each time as a tenant grows, and is not too expensive because our max shard
+                            // count is relatively low anyway.
+                            // This policy will be adjusted in future once we support higher shard count.
+                            new_shard_count: SPLIT_TO_MAX.literal(),
+                            new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+                        },
+                    )
+                    .await
+                {
+                    Ok(_) => {
+                        tracing::info!("Successful auto-split");
+                    }
+                    Err(e) => {
+                        tracing::error!("Auto-split failed: {e}");
+                    }
+                }
+            }
+            .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)),
+        );
+    }
+
     /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
     /// also wait for any generated Reconcilers to complete.  Calling this until it returns zero should
     /// put the system into a quiescent state where future background reconciliations won't do anything.
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 0b2963d89c..4d563a532b 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -890,3 +890,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert current_logical_size == non_incremental
         assert isinstance(current_logical_size, int)
         return current_logical_size
+
+    def top_tenants(
+        self, order_by: str, limit: int, where_shards_lt: int, where_gt: int
+    ) -> dict[Any, Any]:
+        res = self.post(
+            f"http://localhost:{self.port}/v1/top_tenants",
+            json={
+                "order_by": order_by,
+                "limit": limit,
+                "where_shards_lt": where_shards_lt,
+                "where_gt": where_gt,
+            },
+        )
+        self.verbose_error(res)
+        return res.json()  # type: ignore
diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py
new file mode 100644
index 0000000000..9cd83f0959
--- /dev/null
+++ b/test_runner/performance/test_sharding_autosplit.py
@@ -0,0 +1,280 @@
+import concurrent.futures
+import re
+from pathlib import Path
+
+import pytest
+from fixtures.common_types import TenantId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    tenant_get_shards,
+)
+
+
+@pytest.mark.timeout(600)
+def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Check that sharding, including auto-splitting, "just works" under pgbench workloads.
+
+    This is not a benchmark, but it lives in the same place as benchmarks in order to be run
+    on a dedicated node that can sustain some significant throughput.
+
+    Other tests validate the details of shard splitting, error cases etc.  This test is
+    the sanity check that it all really works as expected with realistic amounts of data
+    and under load.
+
+    Success conditions:
+    - Tenants auto-split when their capacity grows
+    - Client workloads are not interrupted while that happens
+    """
+
+    neon_env_builder.num_pageservers = 8
+    neon_env_builder.storage_controller_config = {
+        # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical
+        # sizes, physical sizes, etc).  We will write this much data logically, therefore other sizes
+        # will reliably be greater.
+        "split_threshold": 1024 * 1024 * 500
+    }
+
+    tenant_conf = {
+        # We want layer rewrites to happen as soon as possible (this is the most stressful
+        # case for the system), so set PITR interval to something tiny.
+        "pitr_interval": "5s",
+        # Scaled down thresholds.  We will run at ~1GB scale but would like to emulate
+        # the behavior of a system running at ~100GB scale.
+        "checkpoint_distance": f"{1024 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{1024 * 1024}",
+        "image_creation_threshold": "2",
+        "image_layer_creation_check_threshold": "0",
+    }
+
+    env = neon_env_builder.init_start()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [
+                # We shut down pageservers while they might have some compaction work going on
+                ".*Compaction failed.*shutting down.*"
+            ]
+        )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # The neon_local functionality for updating computes is flaky for unknown reasons
+            ".*Local notification hook failed.*",
+            ".*Marking shard.*for notification retry.*",
+            ".*Failed to notify compute.*",
+        ]
+    )
+
+    # Total tenants
+    tenant_count = 4
+
+    # Transaction rate: we set this rather than running at full-speed because we
+    # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently.
+    transaction_rate = 100
+
+    class TenantState:
+        def __init__(self, timeline_id, endpoint):
+            self.timeline_id = timeline_id
+            self.endpoint = endpoint
+
+    # Create tenants
+    tenants = {}
+    for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)):
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf)
+        endpoint = env.endpoints.create("main", tenant_id=tenant_id)
+        tenants[tenant_id] = TenantState(timeline_id, endpoint)
+        endpoint.start()
+
+    def run_pgbench_init(endpoint):
+        pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-i",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+    def check_pgbench_output(out_path: str):
+        """
+        When we run pgbench, we want not just an absence of errors, but also continuous evidence
+        of I/O progressing: our shard splitting and migration should not interrrupt the benchmark.
+        """
+        matched_lines = 0
+        stderr = Path(f"{out_path}.stderr").read_text()
+
+        low_watermark = None
+
+        # Apply this as a threshold for what we consider an unacceptable interruption to I/O
+        min_tps = transaction_rate // 10
+
+        for line in stderr.split("\n"):
+            match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line)
+            if match is None:
+                # Fall back to older-version pgbench output (omits failure count)
+                match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line)
+                if match is None:
+                    continue
+                else:
+                    (_time, tps) = match.groups()
+                    tps = float(tps)
+                    failed = 0
+            else:
+                (_time, tps, failed) = match.groups()  # type: ignore
+                tps = float(tps)
+                failed = int(failed)
+
+            matched_lines += 1
+
+            if failed > 0:
+                raise RuntimeError(
+                    f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0"
+                )
+
+            if low_watermark is None or low_watermark > tps:
+                low_watermark = tps
+
+            # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not
+            # at the same time as a shard split.
+            # if tps < min_tps:
+            #     raise RuntimeError(
+            #         f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
+            #     )
+
+        log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}")
+
+        if matched_lines == 0:
+            raise RuntimeError(f"pgbench output at {out_path} contained no progress lines")
+
+    def run_pgbench_main(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "180",
+                "-R",
+                f"{transaction_rate}",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(out_path)
+
+    def run_pgbench_read(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "30",
+                "-R",
+                f"{transaction_rate}",
+                "-S",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(out_path)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench inits")
+        for fut in pgbench_futs:
+            fut.result()
+
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read/write pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    def assert_all_split():
+        for tenant_id in tenants.keys():
+            shards = tenant_get_shards(env, tenant_id)
+            assert len(shards) == 8
+
+    # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise
+    # this test is not properly doing its job of validating that splits work nicely under load.
+    assert_all_split()
+
+    env.storage_controller.assert_log_contains(".*Successful auto-split.*")
+
+    # Log timeline sizes, useful for debug, and implicitly validates that the shards
+    # are available in the places the controller thinks they should be.
+    for tenant_id, tenant_state in tenants.items():
+        (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0]
+        timeline_info = shard_zero_ps.http_client().timeline_detail(
+            shard_zero_id, tenant_state.timeline_id
+        )
+        log.info(f"{shard_zero_id} timeline: {timeline_info}")
+
+    # Run compaction for all tenants, restart endpoint so that on subsequent reads we will
+    # definitely hit pageserver for reads.  This compaction passis expected to drop unwanted
+    # layers but not do any rewrites (we're still in the same generation)
+    for tenant_id, tenant_state in tenants.items():
+        tenant_state.endpoint.stop()
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+            shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id)
+        tenant_state.endpoint.start()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    env.storage_controller.consistency_check()
+
+    # Restart the storage controller
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    env.storage_controller.consistency_check()
+
+    # Restart all pageservers
+    for ps in env.pageservers:
+        ps.stop()
+        ps.start()
+
+    # Freshen gc_info in Timeline, so that when compaction runs in the background in the
+    # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass.
+    for tenant_id, tenant_state in tenants.items():
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+
+    # One last check data remains readable after everything has restarted
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    # Assert that some rewrites happened
+    # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged
+    # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 87544af598..1bfeec6f4b 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1326,3 +1326,45 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
 
         # Ensure that post-endpoint-restart modifications are ingested happily by pageserver
         wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
+
+
+def test_top_tenants(neon_env_builder: NeonEnvBuilder):
+    """
+    The top_tenants API is used in shard auto-splitting to find candidates.
+    """
+
+    env = neon_env_builder.init_configs()
+    neon_env_builder.start()
+
+    tenants = []
+    n_tenants = 8
+    for i in range(0, n_tenants):
+        tenant_id = TenantId.generate()
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_tenant(tenant_id, timeline_id)
+
+        # Write a different amount of data to each tenant
+        w = Workload(env, tenant_id, timeline_id)
+        w.init()
+        w.write_rows(i * 1000)
+        w.stop()
+
+        logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+            "current_logical_size"
+        ]
+        tenants.append((tenant_id, timeline_id, logical_size))
+
+        log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}")
+
+    # Ask for 1 largest tenant
+    top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0)
+    assert len(top_1["shards"]) == 1
+    assert top_1["shards"][0]["id"] == str(tenants[-1][0])
+    assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2]
+
+    # Apply a lower bound limit
+    top = env.pageserver.http_client().top_tenants(
+        "max_logical_size", 100, 8, where_gt=tenants[3][2]
+    )
+    assert len(top["shards"]) == n_tenants - 4
+    assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:])

From aaf60819fa479e37a4b477b20e1fbcee2d5a046f Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Fri, 17 May 2024 15:22:49 -0400
Subject: [PATCH 075/130] feat(pageserver): persist aux file policy in index
 part (#7668)

Part of https://github.com/neondatabase/neon/issues/7462

## Summary of changes

Tenant config is not persisted unless it's attached on the storage
controller. In this pull request, we persist the aux file policy flag in
the `index_part.json`.

Admins can set `switch_aux_file_policy` in the storage controller or
using the page server API. Upon the first aux file gets written, the
write path will compare the aux file policy target with the current
policy. If it is switch-able, we will do the switch. Otherwise, the
original policy will be used. The test cases show what the admins can do
/ cannot do.

The `last_aux_file_policy` is stored in `IndexPart`. Updates to the
persisted policy are done via
`schedule_index_upload_for_aux_file_policy_update`. On the write path,
the writer will update the field.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 libs/pageserver_api/src/models.rs             | 134 ++++++++
 pageserver/ctl/src/main.rs                    |   1 +
 pageserver/src/http/routes.rs                 |   2 +
 pageserver/src/pgdatadir_mapping.rs           |  33 +-
 pageserver/src/tenant.rs                      | 296 ++++++++++++++++++
 pageserver/src/tenant/config.rs               |   4 +-
 .../src/tenant/remote_timeline_client.rs      |  13 +
 .../tenant/remote_timeline_client/index.rs    |  87 ++++-
 pageserver/src/tenant/timeline.rs             |   9 +-
 pageserver/src/tenant/timeline/delete.rs      |   2 +
 pageserver/src/tenant/upload_queue.rs         |   6 +
 test_runner/regress/test_aux_files.py         |  72 +++++
 12 files changed, 648 insertions(+), 11 deletions(-)
 create mode 100644 test_runner/regress/test_aux_files.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d52fb5e93d..80ca696313 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,6 +10,7 @@ use std::{
     io::{BufRead, Read},
     num::{NonZeroU64, NonZeroUsize},
     str::FromStr,
+    sync::atomic::AtomicUsize,
     time::{Duration, SystemTime},
 };
 
@@ -308,13 +309,88 @@ pub struct TenantConfig {
     pub switch_aux_file_policy: Option<AuxFilePolicy>,
 }
 
+/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
+/// tenant config. When the first aux file written, the policy will be persisted in the
+/// `index_part.json` file and has a limited migration path.
+///
+/// Currently, we only allow the following migration path:
+///
+/// Unset -> V1
+///       -> V2
+///       -> CrossValidation -> V2
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum AuxFilePolicy {
+    /// V1 aux file policy: store everything in AUX_FILE_KEY
     V1,
+    /// V2 aux file policy: store in the AUX_FILE keyspace
     V2,
+    /// Cross validation runs both formats on the write path and does validation
+    /// on the read path.
     CrossValidation,
 }
 
+impl AuxFilePolicy {
+    pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
+        matches!(
+            (from, to),
+            (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
+        )
+    }
+
+    /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
+    pub fn default_tenant_config() -> Self {
+        Self::V1
+    }
+}
+
+/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
+pub struct AtomicAuxFilePolicy(AtomicUsize);
+
+impl AtomicAuxFilePolicy {
+    pub fn new(policy: Option<AuxFilePolicy>) -> Self {
+        Self(AtomicUsize::new(
+            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
+        ))
+    }
+
+    pub fn load(&self) -> Option<AuxFilePolicy> {
+        match self.0.load(std::sync::atomic::Ordering::Acquire) {
+            0 => None,
+            other => Some(AuxFilePolicy::from_usize(other)),
+        }
+    }
+
+    pub fn store(&self, policy: Option<AuxFilePolicy>) {
+        self.0.store(
+            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
+            std::sync::atomic::Ordering::Release,
+        );
+    }
+}
+
+impl AuxFilePolicy {
+    pub fn to_usize(self) -> usize {
+        match self {
+            Self::V1 => 1,
+            Self::CrossValidation => 2,
+            Self::V2 => 3,
+        }
+    }
+
+    pub fn try_from_usize(this: usize) -> Option<Self> {
+        match this {
+            1 => Some(Self::V1),
+            2 => Some(Self::CrossValidation),
+            3 => Some(Self::V2),
+            _ => None,
+        }
+    }
+
+    pub fn from_usize(this: usize) -> Self {
+        Self::try_from_usize(this).unwrap()
+    }
+}
+
 impl FromStr for AuxFilePolicy {
     type Err = anyhow::Error;
 
@@ -604,6 +680,9 @@ pub struct TimelineInfo {
     pub state: TimelineState,
 
     pub walreceiver_status: String,
+
+    /// The last aux file policy being used on this timeline
+    pub last_aux_file_policy: Option<AuxFilePolicy>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -1505,4 +1584,59 @@ mod tests {
             assert_eq!(actual, expected, "example on {line}");
         }
     }
+
+    #[test]
+    fn test_aux_file_migration_path() {
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            None,
+            AuxFilePolicy::V1
+        ));
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            None,
+            AuxFilePolicy::V2
+        ));
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            None,
+            AuxFilePolicy::CrossValidation
+        ));
+        // Self-migration is not a valid migration path, and the caller should handle it by itself.
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V1),
+            AuxFilePolicy::V1
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V2),
+            AuxFilePolicy::V2
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::CrossValidation),
+            AuxFilePolicy::CrossValidation
+        ));
+        // Migrations not allowed
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::CrossValidation),
+            AuxFilePolicy::V1
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V1),
+            AuxFilePolicy::V2
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V2),
+            AuxFilePolicy::V1
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V2),
+            AuxFilePolicy::CrossValidation
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V1),
+            AuxFilePolicy::CrossValidation
+        ));
+        // Migrations allowed
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::CrossValidation),
+            AuxFilePolicy::V2
+        ));
+    }
 }
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 1fb75584fc..e92c352dab 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -219,6 +219,7 @@ fn handle_metadata(
     let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
     println!("Current metadata:\n{meta:?}");
     let mut update_meta = false;
+    // TODO: simplify this part
     if let Some(disk_consistent_lsn) = disk_consistent_lsn {
         meta = TimelineMetadata::new(
             *disk_consistent_lsn,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b8d5c67ce0..7efd48afc7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -439,6 +439,8 @@ async fn build_timeline_info_common(
         state,
 
         walreceiver_status,
+
+        last_aux_file_policy: timeline.last_aux_file_policy.load(),
     };
     Ok(info)
 }
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 1092d64d33..402f075365 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -35,7 +35,7 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -718,10 +718,11 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        match self.get_switch_aux_file_policy() {
-            AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
-            AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
-            AuxFilePolicy::CrossValidation => {
+        let current_policy = self.last_aux_file_policy.load();
+        match current_policy {
+            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
+            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
+            Some(AuxFilePolicy::CrossValidation) => {
                 let v1_result = self.list_aux_files_v1(lsn, ctx).await;
                 let v2_result = self.list_aux_files_v2(lsn, ctx).await;
                 match (v1_result, v2_result) {
@@ -1469,7 +1470,27 @@ impl<'a> DatadirModification<'a> {
         content: &[u8],
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let policy = self.tline.get_switch_aux_file_policy();
+        let switch_policy = self.tline.get_switch_aux_file_policy();
+
+        let policy = {
+            let current_policy = self.tline.last_aux_file_policy.load();
+            // Allowed switch path:
+            // * no aux files -> v1/v2/cross-validation
+            // * cross-validation->v2
+            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
+                self.tline.last_aux_file_policy.store(Some(switch_policy));
+                self.tline
+                    .remote_client
+                    .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
+                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
+                switch_policy
+            } else {
+                // This branch handles non-valid migration path, and the case that switch_policy == current_policy.
+                // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
+                current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
+            }
+        };
+
         if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
             let key = aux_file::encode_aux_file_key(path);
             // retrieve the key from the engine
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 54b63f7042..d42b9082b7 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,6 +20,7 @@ use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::TimelineState;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::WalRedoManagerStatus;
@@ -529,6 +530,7 @@ impl Tenant {
         index_part: Option<IndexPart>,
         metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
         _ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_shard_id;
@@ -539,6 +541,10 @@ impl Tenant {
             ancestor.clone(),
             resources,
             CreateTimelineCause::Load,
+            // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
+            // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
+            // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
+            last_aux_file_policy,
         )?;
         let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
         anyhow::ensure!(
@@ -553,6 +559,10 @@ impl Tenant {
 
         if let Some(index_part) = index_part.as_ref() {
             timeline.remote_client.init_upload_queue(index_part)?;
+
+            timeline
+                .last_aux_file_policy
+                .store(index_part.last_aux_file_policy());
         } else {
             // No data on the remote storage, but we have local metadata file. We can end up
             // here with timeline_create being interrupted before finishing index part upload.
@@ -1173,12 +1183,15 @@ impl Tenant {
             None
         };
 
+        let last_aux_file_policy = index_part.last_aux_file_policy();
+
         self.timeline_init_and_sync(
             timeline_id,
             resources,
             Some(index_part),
             remote_metadata,
             ancestor,
+            last_aux_file_policy,
             ctx,
         )
         .await
@@ -1358,6 +1371,7 @@ impl Tenant {
             create_guard,
             initdb_lsn,
             None,
+            None,
         )
         .await
     }
@@ -2441,6 +2455,7 @@ impl Tenant {
         ancestor: Option<Arc<Timeline>>,
         resources: TimelineResources,
         cause: CreateTimelineCause,
+        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let state = match cause {
             CreateTimelineCause::Load => {
@@ -2469,6 +2484,7 @@ impl Tenant {
             resources,
             pg_version,
             state,
+            last_aux_file_policy,
             self.cancel.child_token(),
         );
 
@@ -3119,6 +3135,7 @@ impl Tenant {
                 timeline_create_guard,
                 start_lsn + 1,
                 Some(Arc::clone(src_timeline)),
+                src_timeline.last_aux_file_policy.load(),
             )
             .await?;
 
@@ -3312,6 +3329,7 @@ impl Tenant {
                 timeline_create_guard,
                 pgdata_lsn,
                 None,
+                None,
             )
             .await?;
 
@@ -3383,6 +3401,7 @@ impl Tenant {
         create_guard: TimelineCreateGuard<'a>,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> anyhow::Result<UninitializedTimeline> {
         let tenant_shard_id = self.tenant_shard_id;
 
@@ -3398,6 +3417,7 @@ impl Tenant {
                 ancestor,
                 resources,
                 CreateTimelineCause::Load,
+                last_aux_file_policy,
             )
             .context("Failed to create timeline data structure")?;
 
@@ -5621,4 +5641,280 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_branch_copies_dirty_aux_file_flag() {
+        let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
+
+        // the default aux file policy to switch is v1 if not set by the admins
+        assert_eq!(
+            harness.tenant_conf.switch_aux_file_policy,
+            AuxFilePolicy::V1
+        );
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        // no aux file is written at this point, so the persistent flag should be unset
+        assert_eq!(tline.last_aux_file_policy.load(), None);
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // there is no tenant manager to pass the configuration through, so lets mimic it
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        assert_eq!(
+            tline.get_switch_aux_file_policy(),
+            AuxFilePolicy::V2,
+            "wanted state has been updated"
+        );
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
+        );
+
+        // we can read everything from the storage
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "keep v1 storage format when new files are written"
+        );
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        // child copies the last flag even if that is not on remote storage yet
+        assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
+        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
+
+        let files = child.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+        assert_eq!(files.get("pg_logical/mappings/test2"), None);
+
+        // even if we crash here without flushing parent timeline with it's new
+        // last_aux_file_policy we are safe, because child was never meant to access ancestor's
+        // files. the ancestor can even switch back to V1 because of a migration safely.
+    }
+
+    #[tokio::test]
+    async fn aux_file_policy_switch() {
+        let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // there is no tenant manager to pass the configuration through, so lets mimic it
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        assert_eq!(
+            tline.get_switch_aux_file_policy(),
+            AuxFilePolicy::V2,
+            "wanted state has been updated"
+        );
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::CrossValidation),
+            "dirty index_part.json reflected state is yet to be updated"
+        );
+
+        // we can still read the auxfile v1 before we ingest anything new
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "ingesting a file should apply the wanted switch state when applicable"
+        );
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first")),
+            "cross validation writes to both v1 and v2 so this should be available in v2"
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+
+        // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V1),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"third", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.get_switch_aux_file_policy(),
+            AuxFilePolicy::V1,
+            "wanted state has been updated again, even if invalid request"
+        );
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "ingesting a file should apply the wanted switch state when applicable"
+        );
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"third"))
+        );
+
+        // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file)
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test3", b"last", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
+
+        assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"third"))
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test3"),
+            Some(&bytes::Bytes::from_static(b"last"))
+        );
+    }
 }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index a743ce3c16..a695363cdc 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -373,6 +373,8 @@ pub struct TenantConf {
 
     /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
     /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
+    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
+    /// file is written.
     pub switch_aux_file_policy: AuxFilePolicy,
 }
 
@@ -574,7 +576,7 @@ impl Default for TenantConf {
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
             image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::V1,
+            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
         }
     }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 3a1113cf01..d3adae6841 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -189,6 +189,7 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
 
 pub(crate) use download::download_initdb_tar_zst;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -611,6 +612,17 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
+    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
+        self: &Arc<Self>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
     ///
     /// Launch an index-file upload operation in the background, if necessary.
     ///
@@ -1851,6 +1863,7 @@ impl RemoteTimelineClient {
                         dangling_files: HashMap::default(),
                         shutting_down: false,
                         shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        last_aux_file_policy: initialized.last_aux_file_policy,
                     };
 
                     let upload_queue = std::mem::replace(
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index b114d6aa10..032dda7ff3 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -5,6 +5,7 @@
 use std::collections::HashMap;
 
 use chrono::NaiveDateTime;
+use pageserver_api::models::AuxFilePolicy;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
 
@@ -88,6 +89,16 @@ pub struct IndexPart {
 
     #[serde(default)]
     pub(crate) lineage: Lineage,
+
+    /// Describes the kind of aux files stored in the timeline.
+    ///
+    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
+    /// A V1 setting after V2 files have been committed is not accepted.
+    ///
+    /// None means no aux files have been written to the storage before the point
+    /// when this flag is introduced.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
 }
 
 impl IndexPart {
@@ -101,10 +112,11 @@ impl IndexPart {
     ///      is always generated from the keys of `layer_metadata`)
     /// - 4: timeline_layers is fully removed.
     /// - 5: lineage was added
-    const LATEST_VERSION: usize = 5;
+    /// - 6: last_aux_file_policy is added.
+    const LATEST_VERSION: usize = 6;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
@@ -113,6 +125,7 @@ impl IndexPart {
         disk_consistent_lsn: Lsn,
         metadata: TimelineMetadata,
         lineage: Lineage,
+        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> Self {
         let layer_metadata = layers_and_metadata
             .iter()
@@ -126,6 +139,7 @@ impl IndexPart {
             metadata,
             deleted_at: None,
             lineage,
+            last_aux_file_policy,
         }
     }
 
@@ -155,8 +169,13 @@ impl IndexPart {
             example_metadata.disk_consistent_lsn(),
             example_metadata,
             Default::default(),
+            Some(AuxFilePolicy::V1),
         )
     }
+
+    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
+        self.last_aux_file_policy
+    }
 }
 
 impl From<&UploadQueueInitialized> for IndexPart {
@@ -165,7 +184,13 @@ impl From<&UploadQueueInitialized> for IndexPart {
         let metadata = uq.latest_metadata.clone();
         let lineage = uq.latest_lineage.clone();
 
-        Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
+        Self::new(
+            &uq.latest_files,
+            disk_consistent_lsn,
+            metadata,
+            lineage,
+            uq.last_aux_file_policy,
+        )
     }
 }
 
@@ -299,6 +324,7 @@ mod tests {
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: None,
             lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -340,6 +366,7 @@ mod tests {
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: None,
             lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -383,6 +410,7 @@ mod tests {
             deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
                 "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
             lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -428,6 +456,7 @@ mod tests {
             .unwrap(),
             deleted_at: None,
             lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -468,6 +497,7 @@ mod tests {
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -511,6 +541,57 @@ mod tests {
                 reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                 original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
             },
+            last_aux_file_policy: None,
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    #[test]
+    fn v6_indexpart_is_parsed() {
+        let example = r#"{
+            "version":6,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "deleted_at": "2023-07-31T09:00:00.123",
+            "lineage":{
+                "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
+                "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
+            },
+            "last_aux_file_policy": "V2"
+        }"#;
+
+        let expected = IndexPart {
+            version: 6,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                    // serde_json should always parse this but this might be a double with jq for
+                    // example.
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            lineage: Lineage {
+                reparenting_history_truncated: false,
+                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
+                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
+            },
+            last_aux_file_policy: Some(AuxFilePolicy::V2),
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index df9bc9b35b..1fb1928079 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,7 +23,7 @@ use pageserver_api::{
     },
     keyspace::{KeySpaceAccum, SparseKeyPartitioning},
     models::{
-        AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
         DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
         TimelineState,
     },
@@ -413,7 +413,11 @@ pub struct Timeline {
     /// Keep aux directory cache to avoid it's reconstruction on each update
     pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
 
+    /// Size estimator for aux file v2
     pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
+
+    /// Indicate whether aux file v2 storage is enabled.
+    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
 }
 
 pub struct WalReceiverInfo {
@@ -2133,6 +2137,7 @@ impl Timeline {
         resources: TimelineResources,
         pg_version: u32,
         state: TimelineState,
+        aux_file_policy: Option<AuxFilePolicy>,
         cancel: CancellationToken,
     ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2257,6 +2262,8 @@ impl Timeline {
                 }),
 
                 aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
+
+                last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
             };
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 901f5149b3..b5dfc86e77 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -280,6 +280,8 @@ impl DeleteTimelineFlow {
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
                 CreateTimelineCause::Delete,
+                // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
+                None,
             )
             .context("create_timeline_struct")?;
 
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index a2f761fa94..c0cc8f3124 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -8,6 +8,7 @@ use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
 use chrono::NaiveDateTime;
+use pageserver_api::models::AuxFilePolicy;
 use std::sync::Arc;
 use tracing::info;
 use utils::lsn::AtomicLsn;
@@ -60,6 +61,9 @@ pub(crate) struct UploadQueueInitialized {
     /// Part of the flattened "next" `index_part.json`.
     pub(crate) latest_lineage: Lineage,
 
+    /// The last aux file policy used on this timeline.
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
+
     /// `disk_consistent_lsn` from the last metadata file that was successfully
     /// uploaded. `Lsn(0)` if nothing was uploaded yet.
     /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -189,6 +193,7 @@ impl UploadQueue {
             dangling_files: HashMap::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: Default::default(),
         };
 
         *self = UploadQueue::Initialized(state);
@@ -239,6 +244,7 @@ impl UploadQueue {
             dangling_files: HashMap::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: index_part.last_aux_file_policy(),
         };
 
         *self = UploadQueue::Initialized(state);
diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py
new file mode 100644
index 0000000000..be9c41a867
--- /dev/null
+++ b/test_runner/regress/test_aux_files.py
@@ -0,0 +1,72 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    logical_replication_sync,
+)
+
+
+def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start("main")
+    client = env.pageserver.http_client()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    tenant_config = client.tenant_config(tenant_id).effective_config
+    tenant_config["switch_aux_file_policy"] = "V2"
+    client.set_tenant_config(tenant_id, tenant_config)
+    # aux file v2 is enabled on the write path, so for now, it should be unset (or null)
+    assert (
+        client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
+        is None
+    )
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute("create table t(pk integer primary key, payload integer)")
+    cur.execute(
+        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
+    )
+    cur.execute("create publication pub1 for table t, replication_example")
+
+    # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils)
+    # instead of going through the full logical replication process.
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
+    vanilla_pg.safe_psql(
+        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
+    )
+    connstr = endpoint.connstr().replace("'", "''")
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    # Wait logical replication channel to be established
+    logical_replication_sync(vanilla_pg, endpoint)
+    vanilla_pg.stop()
+    endpoint.stop()
+
+    with env.pageserver.http_client() as client:
+        # aux file v2 flag should be enabled at this point
+        assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
+    with env.pageserver.http_client() as client:
+        tenant_config = client.tenant_config(tenant_id).effective_config
+        tenant_config["switch_aux_file_policy"] = "V1"
+        client.set_tenant_config(tenant_id, tenant_config)
+        # the flag should still be enabled
+        assert (
+            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
+                "last_aux_file_policy"
+            ]
+            == "V2"
+        )
+    env.pageserver.restart()
+    with env.pageserver.http_client() as client:
+        # aux file v2 flag should be persisted
+        assert (
+            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
+                "last_aux_file_policy"
+            ]
+            == "V2"
+        )

From e1a9669d05374ea27685a1cf527676fe01df7722 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Fri, 17 May 2024 16:04:02 -0400
Subject: [PATCH 076/130] feat(pagebench): add aux file bench (#7746)

part of https://github.com/neondatabase/neon/issues/7462

## Summary of changes

This pull request adds two APIs to the pageserver management API:
list_aux_files and ingest_aux_files. The aux file pagebench is intended
to be used on an empty timeline because the data do not go through the
safekeeper. LSNs are advanced by 8 for each ingestion, to avoid
invariant checks inside the pageserver.

For now, I only care about space amplification / read amplification, so
the bench is designed in a very simple way: ingest 10000 files, and I
will manually dump the layer map to analyze.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs         | 10 +++
 pageserver/client/src/mgmt_api.rs         | 57 +++++++++++++
 pageserver/pagebench/src/cmd/aux_files.rs | 98 +++++++++++++++++++++++
 pageserver/pagebench/src/main.rs          |  3 +
 pageserver/src/http/routes.rs             | 75 +++++++++++++++++
 5 files changed, 243 insertions(+)
 create mode 100644 pageserver/pagebench/src/cmd/aux_files.rs

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 80ca696313..451ee1a13c 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -841,6 +841,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
     pub max_concurrent_downloads: NonZeroUsize,
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+pub struct IngestAuxFilesRequest {
+    pub aux_files: HashMap<String, String>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ListAuxFilesRequest {
+    pub lsn: Lsn,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct DownloadRemoteLayersTaskInfo {
     pub task_id: String,
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 5904713da9..69b86d9c46 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,8 +1,12 @@
+use std::collections::HashMap;
+
+use bytes::Bytes;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
     http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
+    lsn::Lsn,
 };
 
 pub mod util;
@@ -561,4 +565,57 @@ impl Client {
             }),
         }
     }
+
+    pub async fn ingest_aux_files(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        aux_files: HashMap<String, String>,
+    ) -> Result<bool> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
+        );
+        let resp = self
+            .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
+            .await?;
+        match resp.status() {
+            StatusCode::OK => Ok(true),
+            status => Err(match resp.json::<HttpErrorBody>().await {
+                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+                Err(_) => {
+                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
+                }
+            }),
+        }
+    }
+
+    pub async fn list_aux_files(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<HashMap<String, Bytes>> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/list_aux_files",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
+        );
+        let resp = self
+            .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
+            .await?;
+        match resp.status() {
+            StatusCode::OK => {
+                let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
+                    Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
+                })?;
+                Ok(resp)
+            }
+            status => Err(match resp.json::<HttpErrorBody>().await {
+                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+                Err(_) => {
+                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
+                }
+            }),
+        }
+    }
 }
diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs
new file mode 100644
index 0000000000..eb5b242a5f
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -0,0 +1,98 @@
+use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
+use pageserver_api::shard::TenantShardId;
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Ingest aux files into the pageserver.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    mgmt_api_endpoint: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
+    #[clap(long)]
+    pageserver_jwt: Option<String>,
+
+    targets: Option<Vec<TenantTimelineId>>,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let main_task = rt.spawn(main_impl(args));
+    rt.block_on(main_task).unwrap()
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    let args: &'static Args = Box::leak(Box::new(args));
+
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        args.mgmt_api_endpoint.clone(),
+        args.pageserver_jwt.as_deref(),
+    ));
+
+    // discover targets
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: None,
+            targets: {
+                if let Some(targets) = &args.targets {
+                    if targets.len() != 1 {
+                        anyhow::bail!("must specify exactly one target");
+                    }
+                    Some(targets.clone())
+                } else {
+                    None
+                }
+            },
+        },
+    )
+    .await?;
+
+    let timeline = timelines[0];
+    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
+    let timeline_id = timeline.timeline_id;
+
+    println!("operating on timeline {}", timeline);
+
+    mgmt_api_client
+        .tenant_config(&TenantConfigRequest {
+            tenant_id: timeline.tenant_id,
+            config: TenantConfig {
+                switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                ..Default::default()
+            },
+        })
+        .await?;
+
+    for batch in 0..100 {
+        let items = (0..100)
+            .map(|id| {
+                (
+                    format!("pg_logical/mappings/{:03}.{:03}", batch, id),
+                    format!("{:08}", id),
+                )
+            })
+            .collect::<HashMap<_, _>>();
+        let file_cnt = items.len();
+        mgmt_api_client
+            .ingest_aux_files(tenant_shard_id, timeline_id, items)
+            .await?;
+        println!("ingested {file_cnt} files");
+    }
+
+    let files = mgmt_api_client
+        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
+        .await?;
+
+    println!("{} files found", files.len());
+
+    anyhow::Ok(())
+}
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 743102d853..5527557450 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -14,6 +14,7 @@ mod util {
 
 /// The pagebench CLI sub-commands, dispatched in [`main`] below.
 mod cmd {
+    pub(super) mod aux_files;
     pub(super) mod basebackup;
     pub(super) mod getpage_latest_lsn;
     pub(super) mod ondemand_download_churn;
@@ -27,6 +28,7 @@ enum Args {
     GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
     OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
+    AuxFiles(cmd::aux_files::Args),
 }
 
 fn main() {
@@ -46,6 +48,7 @@ fn main() {
             cmd::trigger_initial_size_calculation::main(args)
         }
         Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
+        Args::AuxFiles(args) => cmd::aux_files::main(args),
     }
     .unwrap()
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7efd48afc7..0eab6510ca 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -16,6 +16,8 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::IngestAuxFilesRequest;
+use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
@@ -2331,6 +2333,71 @@ async fn get_utilization(
         .map_err(ApiError::InternalServerError)
 }
 
+async fn list_aux_files(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let body: ListAuxFilesRequest = json_request(&mut request).await?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let process = || async move {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let files = timeline.list_aux_files(body.lsn, &ctx).await?;
+        Ok::<_, anyhow::Error>(files)
+    };
+
+    match process().await {
+        Ok(st) => json_response(StatusCode::OK, st),
+        Err(err) => json_response(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            ApiError::InternalServerError(err).to_string(),
+        ),
+    }
+}
+
+async fn ingest_aux_files(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let body: IngestAuxFilesRequest = json_request(&mut request).await?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let process = || async move {
+        let mut modification = timeline.begin_modification(Lsn(
+            timeline.get_last_record_lsn().0 + 8
+        ) /* advance LSN by 8 */);
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        for (fname, content) in body.aux_files {
+            modification
+                .put_file(&fname, content.as_bytes(), &ctx)
+                .await?;
+        }
+        modification.commit(&ctx).await?;
+        Ok::<_, anyhow::Error>(())
+    };
+
+    match process().await {
+        Ok(st) => json_response(StatusCode::OK, st),
+        Err(err) => Err(ApiError::InternalServerError(err)),
+    }
+}
+
 /// Report on the largest tenants on this pageserver, for the storage controller to identify
 /// candidates for splitting
 async fn post_top_tenants(
@@ -2708,6 +2775,14 @@ pub fn make_router(
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
+            |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
+        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
+            |r| testing_api_handler("list_aux_files", r, list_aux_files),
+        )
         .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
         .any(handler_404))
 }

From 5caee4ca54ea16905ccc3e7f60b3221f33a74b91 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 19 May 2024 20:49:49 +0300
Subject: [PATCH 077/130] Fix calculation in test

The comment says that this checks if there's enough space on the page
for logical message *and* an XLOG_SWITCH. So the sizes of the logical
message and the XLOG_SWITCH record should be added together, not
subtracted.

I saw a panic in the test that led me to investigate and notice this
(https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7803/9142396223/index.html):

    RuntimeError: Run ['/tmp/neon/bin/wal_craft', 'in-existing', 'last_wal_record_xlog_switch_ends_on_page_boundary', "host=localhost port=16165 user=cloud_admin dbname=postgres options='-cstatement_timeout=120s '"] failed:
      stdout:

      stderr:
        thread 'main' panicked at libs/postgres_ffi/wal_craft/src/lib.rs:370:27:
        attempt to subtract with overflow
        stack backtrace:
           0: rust_begin_unwind
                     at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/std/src/panicking.rs:645:5
           1: core::panicking::panic_fmt
                     at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:72:14
           2: core::panicking::panic
                     at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:145:5
           3: <wal_craft::LastWalRecordXlogSwitchEndsOnPageBoundary as wal_craft::Crafter>::craft::<postgres::client::Client>
                     at libs/postgres_ffi/wal_craft/src/lib.rs:370:27
           4: wal_craft::main::{closure#0}
                     at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:21:17
           5: wal_craft::main
                     at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:66:47
           6: <fn() -> core::result::Result<(), anyhow::Error> as core::ops::function::FnOnce<()>>::call_once
                     at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/ops/function.rs:250:5
        note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
---
 libs/postgres_ffi/wal_craft/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 262068cbda..b6769629a8 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
             // Is there enough space on the page for another logical message and an
             // XLOG_SWITCH? If not, start over.
             let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+            if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
                 continue;
             }
 

From a5ecca976ec3abf97c8c3db4f78c230b4553b509 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Sun, 19 May 2024 20:45:53 +0100
Subject: [PATCH 078/130] proxy: bump parquet (#7782)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary of changes

Updates the parquet lib. one change left that we need is in an open PR
against upstream, hopefully we can remove the git dependency by 52.0.0
https://github.com/apache/arrow-rs/pull/5773

I'm not sure why the parquet files got a little bit bigger. I tested
them and they still open fine. 🤷

side effect of the update, chrono updated and added yet another
deprecation warning (hence why the safekeepers change)
---
 Cargo.lock                   | 29 ++++++++++----
 Cargo.toml                   |  8 ++--
 proxy/src/context/parquet.rs | 73 +++++++++++++++++++-----------------
 safekeeper/src/broker.rs     |  2 +-
 workspace_hack/Cargo.toml    |  4 +-
 5 files changed, 67 insertions(+), 49 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e1edd53fea..e6060c82f5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1072,9 +1072,9 @@ dependencies = [
 
 [[package]]
 name = "chrono"
-version = "0.4.31"
+version = "0.4.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
 dependencies = [
  "android-tzdata",
  "iana-time-zone",
@@ -1082,7 +1082,7 @@ dependencies = [
  "num-traits",
  "serde",
  "wasm-bindgen",
- "windows-targets 0.48.0",
+ "windows-targets 0.52.4",
 ]
 
 [[package]]
@@ -1109,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
 dependencies = [
  "ciborium-io",
- "half",
+ "half 1.8.2",
 ]
 
 [[package]]
@@ -2278,6 +2278,17 @@ version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
 
+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "num-traits",
+]
+
 [[package]]
 name = "hash32"
 version = "0.3.1"
@@ -3902,12 +3913,13 @@ dependencies = [
 
 [[package]]
 name = "parquet"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
  "ahash",
  "bytes",
  "chrono",
+ "half 2.4.1",
  "hashbrown 0.14.5",
  "num",
  "num-bigint",
@@ -3916,12 +3928,13 @@ dependencies = [
  "thrift",
  "twox-hash",
  "zstd",
+ "zstd-sys",
 ]
 
 [[package]]
 name = "parquet_derive"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
  "parquet",
  "proc-macro2",
diff --git a/Cargo.toml b/Cargo.toml
index b59a5dcd6d..2a7dea447e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -122,8 +122,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
+parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
@@ -244,8 +244,8 @@ tonic-build = "0.9"
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 
 # bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
+parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
 
 ################# Binary contents sections
 
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 8104fe6087..392821c430 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -307,7 +307,7 @@ where
 }
 
 async fn upload_parquet(
-    w: SerializedFileWriter<Writer<BytesMut>>,
+    mut w: SerializedFileWriter<Writer<BytesMut>>,
     len: i64,
     storage: &GenericRemoteStorage,
 ) -> anyhow::Result<Writer<BytesMut>> {
@@ -319,11 +319,15 @@ async fn upload_parquet(
 
     // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
     // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
-    let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
+    let (mut buffer, metadata) =
+        tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> {
+            let metadata = w.finish()?;
+            let buffer = std::mem::take(w.inner_mut().get_mut());
+            Ok((buffer, metadata))
+        })
         .await
         .unwrap()?;
 
-    let mut buffer = writer.into_inner();
     let data = buffer.split().freeze();
 
     let compression = len as f64 / len_uncompressed as f64;
@@ -474,10 +478,11 @@ mod tests {
         RequestData {
             session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
             peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
-            timestamp: chrono::NaiveDateTime::from_timestamp_millis(
+            timestamp: chrono::DateTime::from_timestamp_millis(
                 rng.gen_range(1703862754..1803862754),
             )
-            .unwrap(),
+            .unwrap()
+            .naive_utc(),
             application_name: Some("test".to_owned()),
             username: Some(hex::encode(rng.gen::<[u8; 4]>())),
             endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
@@ -560,15 +565,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
             ]
         );
 
@@ -598,11 +603,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1221738, 5, 10000),
-                (1227888, 5, 10000),
-                (1229682, 5, 10000),
-                (1229044, 5, 10000),
-                (1220322, 5, 10000)
+                (1222212, 5, 10000),
+                (1228362, 5, 10000),
+                (1230156, 5, 10000),
+                (1229518, 5, 10000),
+                (1220796, 5, 10000)
             ]
         );
 
@@ -634,11 +639,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1207385, 5, 10000),
-                (1207116, 5, 10000),
-                (1207409, 5, 10000),
-                (1207397, 5, 10000),
-                (1207652, 5, 10000)
+                (1207859, 5, 10000),
+                (1207590, 5, 10000),
+                (1207883, 5, 10000),
+                (1207871, 5, 10000),
+                (1208126, 5, 10000)
             ]
         );
 
@@ -663,15 +668,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
             ]
         );
 
@@ -708,7 +713,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
+            [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
         );
 
         tmpdir.close().unwrap();
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 98f58d3e49..ea16ce450f 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -319,7 +319,7 @@ async fn task_stats(stats: Arc<BrokerStats>) {
 
                 let now = BrokerStats::now_millis();
                 if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
-                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
+                    let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
                     info!("no broker updates for some time, last update: {:?}", ts);
                 }
             }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index b605757f64..7582562450 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -51,7 +51,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -102,7 +102,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }

From 291fcb9e4f7086237c409580b1fd3accf46d5ba3 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 20 May 2024 09:25:25 +0100
Subject: [PATCH 079/130] pageserver: use the heatmap upload interval to set
 the secondary download interval (#7793)

## Problem

The heatmap upload period is configurable, but secondary mode downloads
were using a fixed download period.

Closes: #6200

## Summary of changes

- Use the upload period in the heatmap to adjust the download period.

In practice, this will reduce the frequency of downloads from its
current 60 second period to what heatmaps use, which is 5-10m depending
on environment.

This is an improvement rather than being optimal: we could be smarter
about periods, and schedule downloads to occur around the time we expect
the next upload, rather than just using the same period, but that's
something we can address in future if it comes up.
---
 pageserver/src/tenant/secondary/downloader.rs | 84 ++++++++++---------
 .../regress/test_pageserver_secondary.py      | 18 ++--
 2 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8f27220771..609e1431cf 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -62,14 +62,10 @@ use super::{
     CommandRequest, DownloadCommand,
 };
 
-/// For each tenant, how long must have passed since the last download_tenant call before
-/// calling it again.  This is approximately the time by which local data is allowed
-/// to fall behind remote data.
-///
-/// TODO: this should just be a default, and the actual period should be controlled
-/// via the heatmap itself
-/// `<ttps://github.com/neondatabase/neon/issues/6200>`
-const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+/// For each tenant, default period for how long must have passed since the last download_tenant call before
+/// calling it again.  This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
+/// download, if the uploader populated it.
+const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
 
 /// Range of concurrency we may use when downloading layers within a timeline.  This is independent
 /// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
@@ -152,14 +148,22 @@ pub(super) struct SecondaryDetailTimeline {
     pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }
 
+// Aspects of a heatmap that we remember after downloading it
+#[derive(Clone, Debug)]
+struct DownloadSummary {
+    etag: Etag,
+    #[allow(unused)]
+    mtime: SystemTime,
+    upload_period: Duration,
+}
+
 /// This state is written by the secondary downloader, it is opaque
 /// to TenantManager
 #[derive(Debug)]
 pub(super) struct SecondaryDetail {
     pub(super) config: SecondaryLocationConfig,
 
-    last_download: Option<Instant>,
-    last_etag: Option<Etag>,
+    last_download: Option<DownloadSummary>,
     next_download: Option<Instant>,
     pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
@@ -189,7 +193,6 @@ impl SecondaryDetail {
         Self {
             config,
             last_download: None,
-            last_etag: None,
             next_download: None,
             timelines: HashMap::new(),
         }
@@ -243,9 +246,8 @@ impl SecondaryDetail {
 
 struct PendingDownload {
     secondary_state: Arc<SecondaryTenant>,
-    last_download: Option<Instant>,
+    last_download: Option<DownloadSummary>,
     target_time: Option<Instant>,
-    period: Option<Duration>,
 }
 
 impl scheduler::PendingJob for PendingDownload {
@@ -295,10 +297,17 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
 
         tracing::debug!("Secondary tenant download completed");
 
-        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
-        // take priority to run again.
         let mut detail = secondary_state.detail.lock().unwrap();
-        detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
+
+        let period = detail
+            .last_download
+            .as_ref()
+            .map(|d| d.upload_period)
+            .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
+
+        // We advance next_download irrespective of errors: we don't want error cases to result in
+        // expensive busy-polling.
+        detail.next_download = Some(Instant::now() + period_jitter(period, 5));
     }
 
     async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -331,11 +340,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                     if detail.next_download.is_none() {
                         // Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times.  Subsequent
                         // rounds will use a smaller jitter to avoid accidentally synchronizing later.
-                        detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
+                        detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
                         "Using our constant, which is known to be small compared with clock range",
                     ));
                     }
-                    (detail.last_download, detail.next_download.unwrap())
+                    (detail.last_download.clone(), detail.next_download.unwrap())
                 };
 
                 if now > next_download {
@@ -343,7 +352,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                         secondary_state: secondary_tenant,
                         last_download,
                         target_time: Some(next_download),
-                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
                     })
                 } else {
                     None
@@ -369,7 +377,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
 
         Ok(PendingDownload {
             target_time: None,
-            period: None,
             last_download: None,
             secondary_state: tenant,
         })
@@ -386,7 +393,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
             secondary_state,
             last_download,
             target_time,
-            period,
         } = job;
 
         let (completion, barrier) = utils::completion::channel();
@@ -423,20 +429,15 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
 
             // If the job had a target execution time, we may check our final execution
             // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Only track execution lag if this isn't our first download: otherwise, it is expected
-                // that execution will have taken longer than our configured interval, for example
-                // when starting up a pageserver and
-                if last_download.is_some() {
-                    // Elapsed time includes any scheduling lag as well as the execution of the job
-                    let elapsed = Instant::now().duration_since(target_time);
+            if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
+                // Elapsed time includes any scheduling lag as well as the execution of the job
+                let elapsed = Instant::now().duration_since(target_time);
 
-                    warn_when_period_overrun(
-                        elapsed,
-                        period,
-                        BackgroundLoopKind::SecondaryDownload,
-                    );
-                }
+                warn_when_period_overrun(
+                    elapsed,
+                    last_download.upload_period,
+                    BackgroundLoopKind::SecondaryDownload,
+                );
             }
 
             CompleteDownload {
@@ -525,12 +526,12 @@ impl<'a> TenantDownloader<'a> {
         let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
 
         // We will use the etag from last successful download to make the download conditional on changes
-        let last_etag = self
+        let last_download = self
             .secondary_state
             .detail
             .lock()
             .unwrap()
-            .last_etag
+            .last_download
             .clone();
 
         // Download the tenant's heatmap
@@ -539,7 +540,7 @@ impl<'a> TenantDownloader<'a> {
             etag: heatmap_etag,
             bytes: heatmap_bytes,
         } = match tokio::select!(
-            bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
+            bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
             _ = self.secondary_state.cancel.cancelled() => return Ok(())
         ) {
             HeatMapDownload::Unmodified => {
@@ -599,7 +600,14 @@ impl<'a> TenantDownloader<'a> {
 
         // Only update last_etag after a full successful download: this way will not skip
         // the next download, even if the heatmap's actual etag is unchanged.
-        self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
+        self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
+            etag: heatmap_etag,
+            mtime: heatmap_mtime,
+            upload_period: heatmap
+                .upload_period_ms
+                .map(|ms| Duration::from_millis(ms as u64))
+                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
+        });
 
         Ok(())
     }
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index fdc09a063d..127340a1e7 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -575,7 +575,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
     tenant_timelines = {}
 
     # This mirrors a constant in `downloader.rs`
-    freshen_interval_secs = 60
+    default_download_period_secs = 60
+
+    # The upload period, which will also be the download once the secondary has seen its first heatmap
+    upload_period_secs = 20
 
     for _i in range(0, tenant_count):
         tenant_id = TenantId.generate()
@@ -587,7 +590,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
             placement_policy='{"Attached":1}',
             # Run with a low heatmap period so that we can avoid having to do synthetic API calls
             # to trigger the upload promptly.
-            conf={"heatmap_period": "1s"},
+            conf={"heatmap_period": f"{upload_period_secs}s"},
         )
         env.neon_cli.create_timeline("main2", tenant_id, timeline_b)
 
@@ -597,7 +600,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
 
     # Wait long enough that the background downloads should happen; we expect all the inital layers
     # of all the initial timelines to show up on the secondary location of each tenant.
-    time.sleep(freshen_interval_secs * 1.5)
+    time.sleep(default_download_period_secs * 1.5)
 
     for tenant_id, timelines in tenant_timelines.items():
         attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -613,8 +616,8 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         # Delete the second timeline: this should be reflected later on the secondary
         env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
 
-    # Wait long enough for the secondary locations to see the deletion
-    time.sleep(freshen_interval_secs * 1.5)
+    # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
+    time.sleep(upload_period_secs * 2.5)
 
     for tenant_id, timelines in tenant_timelines.items():
         attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -626,6 +629,9 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         assert ps_secondary.list_layers(tenant_id, timelines[0])
 
         # This one was deleted
+        log.info(
+            f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
+        )
         assert not ps_secondary.list_layers(tenant_id, timelines[1])
 
     t_end = time.time()
@@ -640,7 +646,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
 
     download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)
 
-    expect_download_rate = 1.0 / freshen_interval_secs
+    expect_download_rate = 1.0 / upload_period_secs
     log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")
 
     assert download_rate < expect_download_rate * 2

From a7b84cca5aef3dacc90895f39f0e0ad2b6c950cc Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Mon, 20 May 2024 12:07:25 +0200
Subject: [PATCH 080/130] Upgrade of pgvector to 0.7.0 (#7726)

Upgrade pgvector to 0.7.0.

This PR is based on Heikki's PR #6753 and just uses pgvector 0.7.0
instead of 0.6.0

I have now done all planned manual tests.

The pull request is ready to be reviewed and merged and can be deployed
in production together / after swap enablement.

See (https://github.com/neondatabase/autoscaling/issues/800)

Fixes https://github.com/neondatabase/neon/issues/6516
Fixes https://github.com/neondatabase/neon/issues/7780

## Documentation input for usage recommendations

### maintenance_work_mem
In Neon

`maintenance_work_mem` is very small by default (depends on configured
RAM for your compute but can be as low as 64 MB).
To optimize pgvector index build time you may have to bump it up
according to your working set size (size of tuples for vector index
creation).
You can do so in the current session using

`SET maintenance_work_mem='10 GB';`

The target value you choose should fit into the memory of your compute
size and not exceed 50-60% of available RAM.
The value above has been successfully used on a 7CU endpoint.

### max_parallel_maintenance_workers

max_parallel_maintenance_workers is also small by default (2). For
efficient parallel pgvector index creation you have to bump it up with

`SET max_parallel_maintenance_workers = 7`

to make use of all the CPUs available, assuming you have configured your
endpoint to use 7CU.

## ID input for changelog

pgvector extension in Neon has been upgraded from version 0.5.1 to
version 0.7.0.
Please see https://github.com/pgvector/pgvector/ for documentation of
new capabilities in pgvector version 0.7.0

If you have existing databases with pgvector 0.5.1 already installed
there is a slight difference in behavior in the following corner cases
even if you don't run `ALTER EXTENSION UPDATE`:

### L2 distance from NULL::vector

For the following script, comparing the NULL::vector to non-null vectors
the resulting output changes:

```sql
SET enable_seqscan = off;

CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_l2_ops);

INSERT INTO t (val) VALUES ('[1,2,4]');

SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector);
```
and now the output is
```
   val
---------
 [1,1,1]
 [1,2,4]
 [1,2,3]
 [0,0,0]
(4 rows)
```

For the following script
```sql
SET enable_seqscan = off;

CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING ivfflat (val vector_l2_ops) WITH (lists = 1);

INSERT INTO t (val) VALUES ('[1,2,4]');

SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector);
```
the output now is

```
   val
---------
 [0,0,0]
 [1,2,3]
 [1,1,1]
 [1,2,4]
(4 rows)
```

### changed error messages
If you provide invalid literals for datatype vector you may get
improved/changed error messages, for example:
```sql
neondb=> SELECT '[4e38,1]'::vector;
ERROR:  "4e38" is out of range for type vector
LINE 1: SELECT '[4e38,1]'::vector;
               ^
```

---------

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 .dockerignore              |  1 +
 Dockerfile.compute-node    |  7 +++-
 patches/pgvector.patch     | 78 ++++++++++++++++++++++++++++++++++++++
 pgxn/neon/pagestore_smgr.c | 19 +++++++++-
 4 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100644 patches/pgvector.patch

diff --git a/.dockerignore b/.dockerignore
index f7a6232ba1..1258532db8 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 !libs/
 !neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index bd4534ce1d..5bf3246f34 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+COPY patches/pgvector.patch /pgvector.patch
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
+    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
diff --git a/patches/pgvector.patch b/patches/pgvector.patch
new file mode 100644
index 0000000000..84ac6644c5
--- /dev/null
+++ b/patches/pgvector.patch
@@ -0,0 +1,78 @@
+From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Fri, 2 Feb 2024 22:26:45 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789b..ec54dea 100644
+--- a/src/hnswbuild.c
++++ b/src/hnswbuild.c
+@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+ 
+ 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+ 
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
++#endif
++
+ 	/* Perform inserts */
+ 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+ 
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
++#endif
++
+ 	/* Close relations within worker */
+ 	index_close(indexRel, indexLockmode);
+ 	table_close(heapRel, heapLockmode);
+@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+ 
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(RelationGetSmgr(index));
++#endif
++
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+ 
+ 	BuildGraph(buildstate, forkNum);
+ 
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
++#endif
++
+ 	if (RelationNeedsWAL(index))
++	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+ 
++#ifdef NEON_SMGR
++		{
++#if PG_VERSION_NUM >= 160000
++			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
++#else
++			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
++#endif
++
++			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
++										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
++			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
++		}
++#endif
++	}
++
++#ifdef NEON_SMGR
++	smgr_end_unlogged_build(RelationGetSmgr(index));
++#endif
++
+ 	FreeBuildState(buildstate);
+ }
+ 
+-- 
+2.39.2
+
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index e3b841f526..249ad313b0 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,6 +45,7 @@
  */
 #include "postgres.h"
 
+#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -2822,10 +2823,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
 
 	/*
+	 * Create the local file. In a parallel build, the leader is expected to
+	 * call this first and do it.
+	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	mdcreate(reln, MAIN_FORKNUM, false);
+	if (!IsParallelWorker())
+		mdcreate(reln, MAIN_FORKNUM, false);
 }
 
 /*
@@ -2849,7 +2854,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
 
-	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	/*
+	 * In a parallel build, (only) the leader process performs the 2nd
+	 * phase.
+	 */
+	if (IsParallelWorker())
+	{
+		unlogged_build_rel = NULL;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	}
+	else
+		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }
 
 /*

From e3f51abadf835784471034c583605c6b41154f2c Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 13 May 2024 13:18:18 +0300
Subject: [PATCH 081/130] safekeeper: close connection when COPY stream ends.

We can't gracefully exit COPY mode (and don't need that), so close connection to
prevent further attempts to use it.
---
 libs/postgres_backend/src/lib.rs | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 260018ad89..6c41b7f347 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -820,10 +820,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         Ok(ProcessMsgResult::Continue)
     }
 
-    /// Log as info/error result of handling COPY stream and send back
-    /// ErrorResponse if that makes sense. Shutdown the stream if we got
-    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
-    /// close.
+    /// - Log as info/error result of handling COPY stream and send back
+    ///   ErrorResponse if that makes sense.
+    /// - Shutdown the stream if we got Terminate.
+    /// - Then close the connection because we don't handle exiting from COPY
+    ///   stream normally.
     pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
         use CopyStreamHandlerEnd::*;
 
@@ -849,10 +850,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
             }
         }
 
-        if let Terminate = &end {
-            self.state = ProtoState::Closed;
-        }
-
         let err_to_send_and_errcode = match &end {
             ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
             Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -882,6 +879,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                 error!("failed to send ErrorResponse: {}", ee);
             }
         }
+
+        // Proper COPY stream finishing to continue using the connection is not
+        // implemented at the server side (we don't need it so far). To prevent
+        // further usages of the connection, close it.
+        self.framed.shutdown().await.ok();
+        self.state = ProtoState::Closed;
     }
 }
 

From de8dfee4bda97deb8a2f02e4e4cc0c7641f56d09 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 13 May 2024 13:19:12 +0300
Subject: [PATCH 082/130] safekeeper: log LSNs on walreceiver/walsender exit.

Useful for observability.
---
 safekeeper/src/receive_wal.rs | 29 ++++++++++++++++++++++-------
 safekeeper/src/send_wal.rs    | 11 +++++++----
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 015b53bb2e..0356def7df 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -183,9 +183,19 @@ impl SafekeeperPostgresHandler {
         &mut self,
         pgb: &mut PostgresBackend<IO>,
     ) -> Result<(), QueryError> {
-        if let Err(end) = self.handle_start_wal_push_guts(pgb).await {
+        let mut tli: Option<Arc<Timeline>> = None;
+        if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
             // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            let handle_end_fut = pgb.handle_copy_stream_end(end);
+            // If we managed to create the timeline, augment logging with current LSNs etc.
+            if let Some(tli) = tli {
+                let info = tli.get_safekeeper_info(&self.conf).await;
+                handle_end_fut
+                    .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
+                    .await;
+            } else {
+                handle_end_fut.await;
+            }
         }
         Ok(())
     }
@@ -193,6 +203,7 @@ impl SafekeeperPostgresHandler {
     pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
+        tli: &mut Option<Arc<Timeline>>,
     ) -> Result<(), CopyStreamHandlerEnd> {
         // Notify the libpq client that it's allowed to send `CopyData` messages
         pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -222,13 +233,17 @@ impl SafekeeperPostgresHandler {
         // Read first message and create timeline if needed.
         let res = network_reader.read_first_message().await;
 
-        let res = if let Ok((tli, next_msg)) = res {
+        let network_res = if let Ok((timeline, next_msg)) = res {
             let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
-                tli.get_walreceivers().pageserver_feedback_tx.subscribe();
+                timeline
+                    .get_walreceivers()
+                    .pageserver_feedback_tx
+                    .subscribe();
+            *tli = Some(timeline.clone());
 
             tokio::select! {
                 // todo: add read|write .context to these errors
-                r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
+                r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r,
                 r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
             }
         } else {
@@ -244,13 +259,13 @@ impl SafekeeperPostgresHandler {
         match acceptor_handle {
             None => {
                 // failed even before spawning; read_network should have error
-                Err(res.expect_err("no error with WalAcceptor not spawn"))
+                Err(network_res.expect_err("no error with WalAcceptor not spawn"))
             }
             Some(handle) => {
                 let wal_acceptor_res = handle.await;
 
                 // If there was any network error, return it.
-                res?;
+                network_res?;
 
                 // Otherwise, WalAcceptor thread must have errored.
                 match wal_acceptor_res {
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 59a8c595ab..ecaae9cfe7 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -340,12 +340,16 @@ impl SafekeeperPostgresHandler {
         start_pos: Lsn,
         term: Option<Term>,
     ) -> Result<(), QueryError> {
+        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
         if let Err(end) = self
-            .handle_start_replication_guts(pgb, start_pos, term)
+            .handle_start_replication_guts(pgb, start_pos, term, tli.clone())
             .await
         {
+            let info = tli.get_safekeeper_info(&self.conf).await;
             // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            pgb.handle_copy_stream_end(end)
+            .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn)))
+            .await;
         }
         Ok(())
     }
@@ -355,10 +359,9 @@ impl SafekeeperPostgresHandler {
         pgb: &mut PostgresBackend<IO>,
         start_pos: Lsn,
         term: Option<Term>,
+        tli: Arc<Timeline>,
     ) -> Result<(), CopyStreamHandlerEnd> {
         let appname = self.appname.clone();
-        let tli =
-            GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
 
         // Use a guard object to remove our entry from the timeline when we are done.
         let ws_guard = Arc::new(tli.get_walsenders().register(

From 7701ca45dd2215ecca8b8c3de50926ae9b520ffd Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 20 May 2024 12:08:45 -0400
Subject: [PATCH 083/130] feat(pageserver): generate image layers for sparse
 keyspace (#7567)

Part of https://github.com/neondatabase/neon/issues/7462

Sparse keyspace does not generate image layers for now. This pull
request adds support for generating image layers for sparse keyspace.


## Summary of changes

* Use the scan interface to generate compaction data for sparse
keyspace.
* Track num of delta layers reads during scan.
* Read-trigger compaction: when a scan on the keyspace touches too many
delta files, generate an image layer. There are one hard-coded threshold
for now: max delta layers we want to touch for a scan.
* L0 compaction does not need to compute holes for metadata keyspace.

Know issue: the scan interface currently reads past the image layer,
which causes `delta_layer_accessed` keeps increasing even if image
layers are generated. The pull request to fix that will be separate, and
orthogonal to this one.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/pgdatadir_mapping.rs          |   6 +-
 pageserver/src/tenant.rs                     | 105 +++++-
 pageserver/src/tenant/storage_layer.rs       |  19 +-
 pageserver/src/tenant/timeline.rs            | 339 +++++++++++++------
 pageserver/src/tenant/timeline/compaction.rs |  35 +-
 5 files changed, 363 insertions(+), 141 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 402f075365..b4fc4a08ee 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -40,7 +40,11 @@ use utils::bin_ser::DeserializeError;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
-const MAX_AUX_FILE_DELTAS: usize = 1024;
+/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
+pub const MAX_AUX_FILE_DELTAS: usize = 1024;
+
+/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
 
 #[derive(Debug)]
 pub enum LsnForTimestamp {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d42b9082b7..e598e9d2e3 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4777,7 +4777,12 @@ mod tests {
             info!("Doing vectored read on {:?}", read);
 
             let vectored_res = tline
-                .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
+                .get_vectored_impl(
+                    read.clone(),
+                    reads_lsn,
+                    &mut ValuesReconstructState::new(),
+                    &ctx,
+                )
                 .await;
             tline
                 .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
@@ -4826,7 +4831,7 @@ mod tests {
             .get_vectored_impl(
                 aux_keyspace.clone(),
                 read_lsn,
-                ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(),
                 &ctx,
             )
             .await;
@@ -4971,7 +4976,7 @@ mod tests {
             .get_vectored_impl(
                 read.clone(),
                 current_lsn,
-                ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(),
                 &ctx,
             )
             .await?;
@@ -5106,7 +5111,7 @@ mod tests {
                         ranges: vec![child_gap_at_key..child_gap_at_key.next()],
                     },
                     query_lsn,
-                    ValuesReconstructState::new(),
+                    &mut ValuesReconstructState::new(),
                     &ctx,
                 )
                 .await;
@@ -5547,7 +5552,7 @@ mod tests {
             .await?;
 
         const NUM_KEYS: usize = 1000;
-        const STEP: usize = 100; // random update + scan base_key + idx * STEP
+        const STEP: usize = 10000; // random update + scan base_key + idx * STEP
 
         let cancel = CancellationToken::new();
 
@@ -5580,7 +5585,7 @@ mod tests {
 
         let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
 
-        for _ in 0..10 {
+        for iter in 0..=10 {
             // Read all the blocks
             for (blknum, last_lsn) in updated.iter().enumerate() {
                 test_key.field6 = (blknum * STEP) as u32;
@@ -5595,7 +5600,7 @@ mod tests {
                 .get_vectored_impl(
                     keyspace.clone(),
                     lsn,
-                    ValuesReconstructState::default(),
+                    &mut ValuesReconstructState::default(),
                     &ctx,
                 )
                 .await?
@@ -5631,17 +5636,91 @@ mod tests {
                 updated[blknum] = lsn;
             }
 
-            // Perform a cycle of flush, compact, and GC
-            tline.freeze_and_flush().await?;
-            tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
-            tenant
-                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
-                .await?;
+            // Perform two cycles of flush, compact, and GC
+            for round in 0..2 {
+                tline.freeze_and_flush().await?;
+                tline
+                    .compact(
+                        &cancel,
+                        if iter % 5 == 0 && round == 0 {
+                            let mut flags = EnumSet::new();
+                            flags.insert(CompactFlags::ForceImageLayerCreation);
+                            flags.insert(CompactFlags::ForceRepartition);
+                            flags
+                        } else {
+                            EnumSet::empty()
+                        },
+                        &ctx,
+                    )
+                    .await?;
+                tenant
+                    .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                    .await?;
+            }
         }
 
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        let test_key = base_key;
+        let mut lsn = Lsn(0x10);
+
+        for _ in 0..20 {
+            lsn = Lsn(lsn.0 + 0x10);
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    test_key,
+                    lsn,
+                    &Value::Image(test_img(&format!("{} at {}", 0, lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+            tline.freeze_and_flush().await?; // force create a delta layer
+        }
+
+        let before_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()?
+            .len();
+
+        tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
+
+        let after_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()?
+            .len();
+
+        assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
+
+        assert_eq!(
+            tline.get(test_key, lsn, &ctx).await?,
+            test_img(&format!("{} at {}", 0, lsn))
+        );
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_branch_copies_dirty_aux_file_flag() {
         let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 94a5e9ec47..4c8a518551 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -113,12 +113,17 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
     }
 }
 
-/// Bag of data accumulated during a vectored get
+/// Bag of data accumulated during a vectored get.
 pub(crate) struct ValuesReconstructState {
+    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
+    /// should not expect to get anything from this hashmap.
     pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
 
     keys_done: KeySpaceRandomAccum,
+
+    // Statistics that are still accessible as a caller of `get_vectored_impl`.
     layers_visited: u32,
+    delta_layers_visited: u32,
 }
 
 impl ValuesReconstructState {
@@ -127,6 +132,7 @@ impl ValuesReconstructState {
             keys: HashMap::new(),
             keys_done: KeySpaceRandomAccum::new(),
             layers_visited: 0,
+            delta_layers_visited: 0,
         }
     }
 
@@ -140,8 +146,17 @@ impl ValuesReconstructState {
         }
     }
 
-    pub(crate) fn on_layer_visited(&mut self) {
+    pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
         self.layers_visited += 1;
+        if let ReadableLayer::PersistentLayer(layer) = layer {
+            if layer.layer_desc().is_delta() {
+                self.delta_layers_visited += 1;
+            }
+        }
+    }
+
+    pub(crate) fn get_delta_layers_visited(&self) -> u32 {
+        self.delta_layers_visited
     }
 
     pub(crate) fn get_layers_visited(&self) -> u32 {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1fb1928079..e6b58b7166 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,8 +18,8 @@ use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::{
-        AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
     },
     keyspace::{KeySpaceAccum, SparseKeyPartitioning},
     models::{
@@ -60,7 +60,6 @@ use std::{
     ops::ControlFlow,
 };
 
-use crate::tenant::timeline::init::LocalLayerFileMetadata;
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
@@ -89,6 +88,9 @@ use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{
+    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
+};
 use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},
@@ -782,6 +784,11 @@ pub(crate) enum ShutdownMode {
     Hard,
 }
 
+struct ImageLayerCreationOutcome {
+    image: Option<ResidentLayer>,
+    next_start_key: Key,
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -883,7 +890,7 @@ impl Timeline {
                 }
 
                 let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
+                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                     .await;
 
                 if self.conf.validate_vectored_get {
@@ -1028,7 +1035,12 @@ impl Timeline {
             }
             GetVectoredImpl::Vectored => {
                 let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
+                    .get_vectored_impl(
+                        keyspace.clone(),
+                        lsn,
+                        &mut ValuesReconstructState::new(),
+                        ctx,
+                    )
                     .await;
 
                 if self.conf.validate_vectored_get {
@@ -1116,7 +1128,7 @@ impl Timeline {
             .get_vectored_impl(
                 keyspace.clone(),
                 lsn,
-                ValuesReconstructState::default(),
+                &mut ValuesReconstructState::default(),
                 ctx,
             )
             .await;
@@ -1193,7 +1205,7 @@ impl Timeline {
         &self,
         keyspace: KeySpace,
         lsn: Lsn,
-        mut reconstruct_state: ValuesReconstructState,
+        reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
         let get_kind = if keyspace.total_raw_size() == 1 {
@@ -1205,7 +1217,7 @@ impl Timeline {
         let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
             .for_get_kind(get_kind)
             .start_timer();
-        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
+        self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
             .await?;
         get_data_timer.stop_and_record();
 
@@ -1214,7 +1226,8 @@ impl Timeline {
             .start_timer();
         let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
         let layers_visited = reconstruct_state.get_layers_visited();
-        for (key, res) in reconstruct_state.keys {
+
+        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
             match res {
                 Err(err) => {
                     results.insert(key, Err(err));
@@ -3448,7 +3461,7 @@ impl Timeline {
                 unmapped_keyspace = keyspace_to_read;
                 cont_lsn = next_cont_lsn;
 
-                reconstruct_state.on_layer_visited();
+                reconstruct_state.on_layer_visited(&layer_to_read);
             } else {
                 break;
             }
@@ -4134,6 +4147,176 @@ impl Timeline {
         false
     }
 
+    /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large,
+    /// so that at most one image layer will be produced from this function.
+    async fn create_image_layer_for_rel_blocks(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        start: Key,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        let mut wrote_keys = false;
+
+        let mut key_request_accum = KeySpaceAccum::new();
+        for range in &partition.ranges {
+            let mut key = range.start;
+            while key < range.end {
+                // Decide whether to retain this key: usually we do, but sharded tenants may
+                // need to drop keys that don't belong to them.  If we retain the key, add it
+                // to `key_request_accum` for later issuing a vectored get
+                if self.shard_identity.is_key_disposable(&key) {
+                    debug!(
+                        "Dropping key {} during compaction (it belongs on shard {:?})",
+                        key,
+                        self.shard_identity.get_shard_number(&key)
+                    );
+                } else {
+                    key_request_accum.add_key(key);
+                }
+
+                let last_key_in_range = key.next() == range.end;
+                key = key.next();
+
+                // Maybe flush `key_rest_accum`
+                if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
+                    || (last_key_in_range && key_request_accum.raw_size() > 0)
+                {
+                    let results = self
+                        .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
+                        .await?;
+
+                    for (img_key, img) in results {
+                        let img = match img {
+                            Ok(img) => img,
+                            Err(err) => {
+                                // If we fail to reconstruct a VM or FSM page, we can zero the
+                                // page without losing any actual user data. That seems better
+                                // than failing repeatedly and getting stuck.
+                                //
+                                // We had a bug at one point, where we truncated the FSM and VM
+                                // in the pageserver, but the Postgres didn't know about that
+                                // and continued to generate incremental WAL records for pages
+                                // that didn't exist in the pageserver. Trying to replay those
+                                // WAL records failed to find the previous image of the page.
+                                // This special case allows us to recover from that situation.
+                                // See https://github.com/neondatabase/neon/issues/2601.
+                                //
+                                // Unfortunately we cannot do this for the main fork, or for
+                                // any metadata keys, keys, as that would lead to actual data
+                                // loss.
+                                if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
+                                    warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                    ZERO_PAGE.clone()
+                                } else {
+                                    return Err(CreateImageLayersError::PageReconstructError(err));
+                                }
+                            }
+                        };
+
+                        // Write all the keys we just read into our new image layer.
+                        image_layer_writer.put_image(img_key, img, ctx).await?;
+                        wrote_keys = true;
+                    }
+                }
+            }
+        }
+
+        if wrote_keys {
+            // Normal path: we have written some data into the new image layer for this
+            // partition, so flush it to disk.
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            Ok(ImageLayerCreationOutcome {
+                image: Some(image_layer),
+                next_start_key: img_range.end,
+            })
+        } else {
+            // Special case: the image layer may be empty if this is a sharded tenant and the
+            // partition does not cover any keys owned by this shard.  In this case, to ensure
+            // we don't leave gaps between image layers, leave `start` where it is, so that the next
+            // layer we write will cover the key range that we just scanned.
+            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+            Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: start,
+            })
+        }
+    }
+
+    /// Create an image layer for metadata keys. This function produces one image layer for all metadata
+    /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
+    /// would not be too large to fit in a single image layer.
+    #[allow(clippy::too_many_arguments)]
+    async fn create_image_layer_for_metadata_keys(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        mode: ImageLayerCreationMode,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
+
+        // Metadata keys image layer creation.
+        let mut reconstruct_state = ValuesReconstructState::default();
+        let data = self
+            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
+            .await?;
+        let (data, total_kb_retrieved, total_key_retrieved) = {
+            let mut new_data = BTreeMap::new();
+            let mut total_kb_retrieved = 0;
+            let mut total_key_retrieved = 0;
+            for (k, v) in data {
+                let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
+                total_kb_retrieved += KEY_SIZE + v.len();
+                total_key_retrieved += 1;
+                new_data.insert(k, v);
+            }
+            (new_data, total_kb_retrieved / 1024, total_key_retrieved)
+        };
+        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
+
+        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
+        info!(
+            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
+                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
+                total_key_retrieved={total_key_retrieved}"
+        );
+        if !trigger_generation && mode == ImageLayerCreationMode::Try {
+            return Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: img_range.end,
+            });
+        }
+        let has_keys = !data.is_empty();
+        for (k, v) in data {
+            // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
+            // considers this situation properly.
+            // if v.is_empty() {
+            //     continue;
+            // }
+
+            // No need to handle sharding b/c metadata keys are always on the 0-th shard.
+
+            // TODO: split image layers to avoid too large layer files. Too large image files are not handled
+            // on the normal data path either.
+            image_layer_writer.put_image(k, v, ctx).await?;
+        }
+        Ok(ImageLayerCreationOutcome {
+            image: if has_keys {
+                let image_layer = image_layer_writer.finish(self, ctx).await?;
+                Some(image_layer)
+            } else {
+                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                None
+            },
+            next_start_key: img_range.end,
+        })
+    }
+
     #[tracing::instrument(skip_all, fields(%lsn, %mode))]
     async fn create_image_layers(
         self: &Arc<Timeline>,
@@ -4175,19 +4358,17 @@ impl Timeline {
 
         for partition in partitioning.parts.iter() {
             let img_range = start..partition.ranges.last().unwrap().end;
-
-            if partition.overlaps(&Key::metadata_key_range()) {
-                // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
-                // rather big change. Keep this patch small for now.
-                match mode {
-                    ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
-                        // skip image layer creation anyways for metadata keys.
-                        start = img_range.end;
-                        continue;
-                    }
-                    ImageLayerCreationMode::Initial => {
-                        return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                    }
+            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
+            if compact_metadata {
+                for range in &partition.ranges {
+                    assert!(
+                        range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
+                            && range.end.field1 <= METADATA_KEY_END_PREFIX,
+                        "metadata keys must be partitioned separately"
+                    );
+                }
+                if mode == ImageLayerCreationMode::Initial {
+                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
                 }
             } else if let ImageLayerCreationMode::Try = mode {
                 // check_for_image_layers = false -> skip
@@ -4198,7 +4379,7 @@ impl Timeline {
                 }
             }
 
-            let mut image_layer_writer = ImageLayerWriter::new(
+            let image_layer_writer = ImageLayerWriter::new(
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
@@ -4214,87 +4395,39 @@ impl Timeline {
                 )))
             });
 
-            let mut wrote_keys = false;
+            if !compact_metadata {
+                let ImageLayerCreationOutcome {
+                    image,
+                    next_start_key,
+                } = self
+                    .create_image_layer_for_rel_blocks(
+                        partition,
+                        image_layer_writer,
+                        lsn,
+                        ctx,
+                        img_range,
+                        start,
+                    )
+                    .await?;
 
-            let mut key_request_accum = KeySpaceAccum::new();
-            for range in &partition.ranges {
-                let mut key = range.start;
-                while key < range.end {
-                    // Decide whether to retain this key: usually we do, but sharded tenants may
-                    // need to drop keys that don't belong to them.  If we retain the key, add it
-                    // to `key_request_accum` for later issuing a vectored get
-                    if self.shard_identity.is_key_disposable(&key) {
-                        debug!(
-                            "Dropping key {} during compaction (it belongs on shard {:?})",
-                            key,
-                            self.shard_identity.get_shard_number(&key)
-                        );
-                    } else {
-                        key_request_accum.add_key(key);
-                    }
-
-                    let last_key_in_range = key.next() == range.end;
-                    key = key.next();
-
-                    // Maybe flush `key_rest_accum`
-                    if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
-                        || (last_key_in_range && key_request_accum.raw_size() > 0)
-                    {
-                        let results = self
-                            .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
-                            .await?;
-
-                        for (img_key, img) in results {
-                            let img = match img {
-                                Ok(img) => img,
-                                Err(err) => {
-                                    // If we fail to reconstruct a VM or FSM page, we can zero the
-                                    // page without losing any actual user data. That seems better
-                                    // than failing repeatedly and getting stuck.
-                                    //
-                                    // We had a bug at one point, where we truncated the FSM and VM
-                                    // in the pageserver, but the Postgres didn't know about that
-                                    // and continued to generate incremental WAL records for pages
-                                    // that didn't exist in the pageserver. Trying to replay those
-                                    // WAL records failed to find the previous image of the page.
-                                    // This special case allows us to recover from that situation.
-                                    // See https://github.com/neondatabase/neon/issues/2601.
-                                    //
-                                    // Unfortunately we cannot do this for the main fork, or for
-                                    // any metadata keys, keys, as that would lead to actual data
-                                    // loss.
-                                    if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
-                                    {
-                                        warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
-                                        ZERO_PAGE.clone()
-                                    } else {
-                                        return Err(CreateImageLayersError::PageReconstructError(
-                                            err,
-                                        ));
-                                    }
-                                }
-                            };
-
-                            // Write all the keys we just read into our new image layer.
-                            image_layer_writer.put_image(img_key, img, ctx).await?;
-                            wrote_keys = true;
-                        }
-                    }
-                }
-            }
-
-            if wrote_keys {
-                // Normal path: we have written some data into the new image layer for this
-                // partition, so flush it to disk.
-                start = img_range.end;
-                let image_layer = image_layer_writer.finish(self, ctx).await?;
-                image_layers.push(image_layer);
+                start = next_start_key;
+                image_layers.extend(image);
             } else {
-                // Special case: the image layer may be empty if this is a sharded tenant and the
-                // partition does not cover any keys owned by this shard.  In this case, to ensure
-                // we don't leave gaps between image layers, leave `start` where it is, so that the next
-                // layer we write will cover the key range that we just scanned.
-                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                let ImageLayerCreationOutcome {
+                    image,
+                    next_start_key,
+                } = self
+                    .create_image_layer_for_metadata_keys(
+                        partition,
+                        image_layer_writer,
+                        lsn,
+                        ctx,
+                        img_range,
+                        mode,
+                    )
+                    .await?;
+                start = next_start_key;
+                image_layers.extend(image);
             }
         }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index ed48b4c9cb..2eff469591 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -116,9 +116,13 @@ impl Timeline {
 
                 // 3. Create new image layers for partitions that have been modified
                 // "enough".
-                let dense_layers = self
+                let mut partitioning = dense_partitioning;
+                partitioning
+                    .parts
+                    .extend(sparse_partitioning.into_dense().parts);
+                let image_layers = self
                     .create_image_layers(
-                        &dense_partitioning,
+                        &partitioning,
                         lsn,
                         if flags.contains(CompactFlags::ForceImageLayerCreation) {
                             ImageLayerCreationMode::Force
@@ -130,24 +134,8 @@ impl Timeline {
                     .await
                     .map_err(anyhow::Error::from)?;
 
-                // For now, nothing will be produced...
-                let sparse_layers = self
-                    .create_image_layers(
-                        &sparse_partitioning.clone().into_dense(),
-                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                assert!(sparse_layers.is_empty());
-
-                self.upload_new_image_layers(dense_layers)?;
-                dense_partitioning.parts.len()
+                self.upload_new_image_layers(image_layers)?;
+                partitioning.parts.len()
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -499,8 +487,11 @@ impl Timeline {
 
         for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
             if let Some(prev_key) = prev {
-                // just first fast filter
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                // just first fast filter, do not create hole entries for metadata keys. The last hole in the
+                // compaction is the gap between data key and metadata keys.
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
+                    && !Key::is_metadata_key(&prev_key)
+                {
                     let key_range = prev_key..next_key;
                     // Measuring hole by just subtraction of i128 representation of key range boundaries
                     // has not so much sense, because largest holes will corresponds field1/field2 changes.

From 2d7091871f82b6e9d598fdaed8cc28bdc27b9a16 Mon Sep 17 00:00:00 2001
From: Andy Hattemer <andrew.hattemer@gmail.com>
Date: Mon, 20 May 2024 12:15:43 -0400
Subject: [PATCH 084/130] Update banner image in Readme (#7801)

Update the readme banner with updated branding.
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 00a90f4483..ea0a289502 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
+[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
+
+
 
 # Neon
 

From 6810d2aa53b7b7646013d2f236d155a4f1b4721d Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 20 May 2024 14:24:18 -0400
Subject: [PATCH 085/130] feat(pageserver): do not read past image layers for
 vectored get (#7773)

## Problem

Part of https://github.com/neondatabase/neon/issues/7462

On metadata keyspace, vectored get will not stop if a key is not found,
and will read past the image layer. However, the semantics is different
from single get, because if a key does not exist in the image layer, it
means that the key does not exist in the past, or have been deleted.
This pull request fixed it by recording image layer coverage during the
vectored get process and stop when the full keyspace is covered by an
image layer. A corresponding test case is added to ensure generating
image layer reduces the number of delta layers.

This optimization (or bug fix) also applies to rel block keyspaces. If a
key is missing, we can know it's missing once the first image layer is
reached. Page server will not attempt to read lower layers, which
potentially incurs layer downloads + evictions.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/keyspace.rs           |   2 +-
 pageserver/src/tenant.rs                      | 372 +++++++++++++++++-
 pageserver/src/tenant/storage_layer.rs        |  26 +-
 .../src/tenant/storage_layer/image_layer.rs   |   4 +
 pageserver/src/tenant/timeline.rs             |  79 +++-
 5 files changed, 461 insertions(+), 22 deletions(-)

diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index c0c4710a00..12c6dc3a6d 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,7 +307,7 @@ impl KeySpace {
     }
 
     /// Merge another keyspace into the current one.
-    /// Note: the keyspaces must not ovelap (enforced via assertions)
+    /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
     pub fn merge(&mut self, other: &KeySpace) {
         let all_ranges = self
             .ranges
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e598e9d2e3..1a66f2c919 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3968,7 +3968,7 @@ mod tests {
     use crate::tenant::harness::*;
     use crate::tenant::timeline::CompactFlags;
     use crate::DEFAULT_PG_VERSION;
-    use bytes::BytesMut;
+    use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
     use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
@@ -5996,4 +5996,374 @@ mod tests {
             Some(&bytes::Bytes::from_static(b"last"))
         );
     }
+
+    #[tokio::test]
+    async fn test_metadata_image_creation() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_image_creation")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        const NUM_KEYS: usize = 1000;
+        const STEP: usize = 10000; // random update + scan base_key + idx * STEP
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        let mut test_key = base_key;
+        let mut lsn = Lsn(0x10);
+
+        async fn scan_with_statistics(
+            tline: &Timeline,
+            keyspace: &KeySpace,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
+            let mut reconstruct_state = ValuesReconstructState::default();
+            let res = tline
+                .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                .await?;
+            Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
+        }
+
+        #[allow(clippy::needless_range_loop)]
+        for blknum in 0..NUM_KEYS {
+            lsn = Lsn(lsn.0 + 0x10);
+            test_key.field6 = (blknum * STEP) as u32;
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    test_key,
+                    lsn,
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+        }
+
+        let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
+
+        for iter in 1..=10 {
+            for _ in 0..NUM_KEYS {
+                lsn = Lsn(lsn.0 + 0x10);
+                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                test_key.field6 = (blknum * STEP) as u32;
+                let mut writer = tline.writer().await;
+                writer
+                    .put(
+                        test_key,
+                        lsn,
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
+                    )
+                    .await?;
+                writer.finish_write(lsn);
+                drop(writer);
+            }
+
+            tline.freeze_and_flush().await?;
+
+            if iter % 5 == 0 {
+                let (_, before_delta_file_accessed) =
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                tline
+                    .compact(
+                        &cancel,
+                        {
+                            let mut flags = EnumSet::new();
+                            flags.insert(CompactFlags::ForceImageLayerCreation);
+                            flags.insert(CompactFlags::ForceRepartition);
+                            flags
+                        },
+                        &ctx,
+                    )
+                    .await?;
+                let (_, after_delta_file_accessed) =
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
+                // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
+                assert!(
+                    after_delta_file_accessed <= 2,
+                    "after_delta_file_accessed={after_delta_file_accessed}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create a image layer
+        }
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("data key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?; // this will create a delta
+
+            {
+                // update the partitioning to include the test key space, otherwise they
+                // will be dropped by image layer creation
+                let mut guard = child.partitioning.lock().await;
+                let ((partitioning, _), partition_lsn) = &mut *guard;
+                partitioning
+                    .parts
+                    .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
+                *partition_lsn = lsn;
+            }
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for the keys, TODO: check if the image layer is created
+        }
+
+        async fn get_vectored_impl_wrapper(
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()),
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?;
+            Ok(res.pop_last().map(|(k, v)| {
+                assert_eq!(k, key);
+                v.unwrap()
+            }))
+        }
+
+        // test vectored get on parent timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
+            Some(test_img("data key 1"))
+        );
+        assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
+            .await
+            .unwrap_err()
+            .is_missing_key_error());
+        assert!(
+            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
+                .await
+                .unwrap_err()
+                .is_missing_key_error()
+        );
+
+        // test vectored get on child timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
+            Some(test_img("data key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
+            Some(test_img("data key 2"))
+        );
+        assert!(
+            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
+                .await
+                .unwrap_err()
+                .is_missing_key_error()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        base_key_child.field1 = AUX_KEY_PREFIX;
+        base_key_nonexist.field1 = AUX_KEY_PREFIX;
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    base_key,
+                    lsn,
+                    &Value::Image(test_img("metadata key 1")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create an image layer
+
+            tline
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("metadata key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?;
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
+        async fn get_vectored_impl_wrapper(
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()),
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?;
+            Ok(res.pop_last().map(|(k, v)| {
+                assert_eq!(k, key);
+                v.unwrap()
+            }))
+        }
+
+        // test vectored get on parent timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
+            Some(test_img("metadata key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+
+        // test vectored get on child timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
+            Some(test_img("metadata key 2"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 4c8a518551..9ccf20c0d4 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -113,14 +113,17 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
     }
 }
 
-/// Bag of data accumulated during a vectored get.
+/// Bag of data accumulated during a vectored get..
 pub(crate) struct ValuesReconstructState {
     /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
     /// should not expect to get anything from this hashmap.
     pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
-
+    /// The keys which are already retrieved
     keys_done: KeySpaceRandomAccum,
 
+    /// The keys covered by the image layers
+    keys_with_image_coverage: Option<Range<Key>>,
+
     // Statistics that are still accessible as a caller of `get_vectored_impl`.
     layers_visited: u32,
     delta_layers_visited: u32,
@@ -131,6 +134,7 @@ impl ValuesReconstructState {
         Self {
             keys: HashMap::new(),
             keys_done: KeySpaceRandomAccum::new(),
+            keys_with_image_coverage: None,
             layers_visited: 0,
             delta_layers_visited: 0,
         }
@@ -186,6 +190,16 @@ impl ValuesReconstructState {
         }
     }
 
+    /// On hitting image layer, we can mark all keys in this range as done, because
+    /// if the image layer does not contain a key, it is deleted/never added.
+    pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
+        let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
+        assert_eq!(
+            prev_val, None,
+            "should consume the keyspace before the next iteration"
+        );
+    }
+
     /// Update the state collected for a given key.
     /// Returns true if this was the last value needed for the key and false otherwise.
     ///
@@ -248,8 +262,12 @@ impl ValuesReconstructState {
 
     /// Returns the key space describing the keys that have
     /// been marked as completed since the last call to this function.
-    pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
-        self.keys_done.consume_keyspace()
+    /// Returns individual keys done, and the image layer coverage.
+    pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
+        (
+            self.keys_done.consume_keyspace(),
+            self.keys_with_image_coverage.take(),
+        )
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 6ea452b993..becd1e7a6d 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -158,6 +158,7 @@ pub struct ImageLayerInner {
     index_start_blk: u32,
     index_root_blk: u32,
 
+    key_range: Range<Key>,
     lsn: Lsn,
 
     file: VirtualFile,
@@ -419,6 +420,7 @@ impl ImageLayerInner {
             file,
             file_id,
             max_vectored_read_bytes,
+            key_range: actual_summary.key_range,
         }))
     }
 
@@ -478,6 +480,8 @@ impl ImageLayerInner {
         self.do_reads_and_update_state(reads, reconstruct_state, ctx)
             .await;
 
+        reconstruct_state.on_image_layer_visited(&self.key_range);
+
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e6b58b7166..7f2a41d90c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -21,7 +21,7 @@ use pageserver_api::{
         AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
         NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
     },
-    keyspace::{KeySpaceAccum, SparseKeyPartitioning},
+    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
         AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
         DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
@@ -348,8 +348,8 @@ pub struct Timeline {
     // though let's keep them both for better error visibility.
     pub initdb_lsn: Lsn,
 
-    /// When did we last calculate the partitioning?
-    partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
+    /// When did we last calculate the partitioning? Make it pub to test cases.
+    pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
 
     /// Configuration: how often should the partitioning be recalculated.
     repartition_threshold: u64,
@@ -483,6 +483,11 @@ impl GcCutoffs {
     }
 }
 
+pub(crate) struct TimelineVisitOutcome {
+    completed_keyspace: KeySpace,
+    image_covered_keyspace: KeySpace,
+}
+
 /// An error happened in a get() operation.
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum PageReconstructError {
@@ -507,6 +512,13 @@ pub(crate) enum PageReconstructError {
     MissingKey(MissingKeyError),
 }
 
+impl GetVectoredError {
+    #[cfg(test)]
+    pub(crate) fn is_missing_key_error(&self) -> bool {
+        matches!(self, Self::MissingKey(_))
+    }
+}
+
 #[derive(Debug)]
 pub struct MissingKeyError {
     key: Key,
@@ -3300,12 +3312,15 @@ impl Timeline {
 
         let mut cont_lsn = Lsn(request_lsn.0 + 1);
 
-        loop {
+        let missing_keyspace = loop {
             if self.cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
 
-            let completed = Self::get_vectored_reconstruct_data_timeline(
+            let TimelineVisitOutcome {
+                completed_keyspace: completed,
+                image_covered_keyspace,
+            } = Self::get_vectored_reconstruct_data_timeline(
                 timeline,
                 keyspace.clone(),
                 cont_lsn,
@@ -3324,12 +3339,31 @@ impl Timeline {
                 ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
             });
 
-            // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
-            // into ancestor timelines). TODO: is there any other metadata which we want to inherit?
-            if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
-                break;
+            // Keyspace is fully retrieved
+            if keyspace.is_empty() {
+                break None;
             }
 
+            // Not fully retrieved but no ancestor timeline.
+            if timeline.ancestor_timeline.is_none() {
+                break Some(keyspace);
+            }
+
+            // Now we see if there are keys covered by the image layer but does not exist in the
+            // image layer, which means that the key does not exist.
+
+            // The block below will stop the vectored search if any of the keys encountered an image layer
+            // which did not contain a snapshot for said key. Since we have already removed all completed
+            // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
+            // space. If that's not the case, we had at least one key encounter a gap in the image layer
+            // and stop the search as a result of that.
+            let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            if !removed.is_empty() {
+                break Some(removed);
+            }
+            // If we reached this point, `remove_overlapping_with` should not have made any change to the
+            // keyspace.
+
             // Take the min to avoid reconstructing a page with data newer than request Lsn.
             cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
             timeline_owned = timeline
@@ -3337,14 +3371,14 @@ impl Timeline {
                 .await
                 .map_err(GetVectoredError::GetReadyAncestorError)?;
             timeline = &*timeline_owned;
-        }
+        };
 
-        if keyspace.total_raw_size() != 0 {
+        if let Some(missing_keyspace) = missing_keyspace {
             return Err(GetVectoredError::MissingKey(MissingKeyError {
-                key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
+                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
                 shard: self
                     .shard_identity
-                    .get_shard_number(&keyspace.start().unwrap()),
+                    .get_shard_number(&missing_keyspace.start().unwrap()),
                 cont_lsn,
                 request_lsn,
                 ancestor_lsn: Some(timeline.ancestor_lsn),
@@ -3369,6 +3403,9 @@ impl Timeline {
     ///
     /// At each iteration pop the top of the fringe (the layer with the highest Lsn)
     /// and get all the required reconstruct data from the layer in one go.
+    ///
+    /// Returns the completed keyspace and the keyspaces with image coverage. The caller
+    /// decides how to deal with these two keyspaces.
     async fn get_vectored_reconstruct_data_timeline(
         timeline: &Timeline,
         keyspace: KeySpace,
@@ -3376,20 +3413,27 @@ impl Timeline {
         reconstruct_state: &mut ValuesReconstructState,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> Result<KeySpace, GetVectoredError> {
+    ) -> Result<TimelineVisitOutcome, GetVectoredError> {
         let mut unmapped_keyspace = keyspace.clone();
         let mut fringe = LayerFringe::new();
 
         let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
 
         loop {
             if cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
 
-            let keys_done_last_step = reconstruct_state.consume_done_keys();
+            let (keys_done_last_step, keys_with_image_coverage) =
+                reconstruct_state.consume_done_keys();
             unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
             completed_keyspace.merge(&keys_done_last_step);
+            if let Some(keys_with_image_coverage) = keys_with_image_coverage {
+                unmapped_keyspace
+                    .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
+                image_covered_keyspace.add_range(keys_with_image_coverage);
+            }
 
             // Do not descent any further if the last layer we visited
             // completed all keys in the keyspace it inspected. This is not
@@ -3467,7 +3511,10 @@ impl Timeline {
             }
         }
 
-        Ok(completed_keyspace)
+        Ok(TimelineVisitOutcome {
+            completed_keyspace,
+            image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
+        })
     }
 
     /// # Cancel-safety

From 6f3e043a76dd47a18180eec627ad5f5bbade6186 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Mon, 20 May 2024 17:00:47 -0700
Subject: [PATCH 086/130] Add some more replication slot metrics (#7761)

## Problem
We want to add alerts for when people's replication slots break, and
also metrics for retained WAL so that we can make warn customers when
their storage gets bloated.

## Summary of changes
Adds the metrics. Addresses
https://github.com/neondatabase/neon/issues/7593
---
 vm-image-spec.yaml | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index fa7cd014bf..0f9d56e466 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -254,8 +254,8 @@ files:
           select
             case
               when pg_catalog.pg_is_in_recovery()
-              then pg_last_wal_replay_lsn()
-              else pg_current_wal_lsn()
+              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+              else (pg_current_wal_lsn() - '0/0')::FLOAT8
             end as lsn;
 
       - metric_name: replication_delay_bytes
@@ -294,6 +294,9 @@ files:
         query: |
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
 
+      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
+      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
+
       # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
       - metric_name: logical_slot_restart_lsn
         type: gauge
@@ -302,7 +305,32 @@ files:
           - slot_name
         values: [restart_lsn]
         query: |
-          select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical';
+          select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical';
+
+      - metric_name: retained_wal
+        type: gauge
+        help: 'Retained WAL in inactive replication slots'
+        key_labels:
+          - slot_name
+        values: [retained_wal]
+        query: |
+          SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+          FROM pg_replication_slots
+          WHERE active = false;
+
+      - metric_name: wal_is_lost
+        type: gauge
+        help: 'Whether or not the replication slot\'s wal_status is lost'
+        key_labels:
+          - slot_name
+        values: [wal_status_is_lost]
+        query: |
+          SELECT slot_name,
+          CASE
+            WHEN wal_status = 'lost' THEN 1
+            ELSE 0
+          END AS wal_status_is_lost
+          FROM pg_replication_slots;
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling

From baeb58432f77809f68eb648481e81ed5af15a8b0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 21 May 2024 10:48:17 +0000
Subject: [PATCH 087/130] build(deps): bump requests from 2.31.0 to 2.32.0
 (#7816)

---
 poetry.lock    | 21 ++++++++++++++++-----
 pyproject.toml |  2 +-
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index ef9f572b17..25c0c7398d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2405,6 +2405,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2529,13 +2530,13 @@ files = [
 
 [[package]]
 name = "requests"
-version = "2.31.0"
+version = "2.32.0"
 description = "Python HTTP for Humans."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
-    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
+    {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
+    {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
 ]
 
 [package.dependencies]
@@ -2959,6 +2960,16 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3196,4 +3207,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
diff --git a/pyproject.toml b/pyproject.toml
index ac7f9b061c..131d1121f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ pytest = "^7.4.4"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
-requests = "^2.31.0"
+requests = "^2.32.0"
 pytest-xdist = "^3.3.1"
 asyncpg = "^0.29.0"
 aiopg = "^1.4.0"

From 4ce6e2d2fc83ff8664eef2f80912cade71240669 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 21 May 2024 13:46:04 +0100
Subject: [PATCH 088/130] pageserver: fix secondary progress stats when layers
 are 404 (#7814)

## Problem

Noticed this issue in staging.

When a tenant is under somewhat heavy timeline creation/deletion
thrashing, it becomes quite common for secondary downloads to encounter
404s downloading layers. This is tolerated by design, because heatmaps
are not guaranteed to be up to date with what layers/timelines actually
exist.

However, we were not updating the SecondaryProgress structure in this
case, so after such a download pass, we would leave a SecondaryProgress
state with lower "downloaded" stats than "total" stats. This causes the
storage controller to consider this secondary location inelegible for
optimization actions such as we do after shard splits

This issue has relative low impact because a typical tenant will
eventually upload a heatmap where we do download all the layers and
thereby enable the controller to progress with migrations -- the heavy
thrashing of timeline creation/deletion is an artifact of our nightly
stress tests.

## Summary of changes

- In the layer 404 case, subtract the skipped layer's stats from the
totals, so that at the end of this download pass we should still end up
in a complete state.
- When updating `last_downloaded`, do a sanity check that our progress
is complete. In debug builds, assert out if this is not the case. In
prod builds, correct the stats and log a warning.
---
 pageserver/src/tenant/secondary/downloader.rs | 92 +++++++++++++------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 609e1431cf..870475eb57 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -569,6 +569,39 @@ impl<'a> TenantDownloader<'a> {
             heatmap.timelines.len()
         );
 
+        // Get or initialize the local disk state for the timelines we will update
+        let mut timeline_states = HashMap::new();
+        for timeline in &heatmap.timelines {
+            let timeline_state = self
+                .secondary_state
+                .detail
+                .lock()
+                .unwrap()
+                .timelines
+                .get(&timeline.timeline_id)
+                .cloned();
+
+            let timeline_state = match timeline_state {
+                Some(t) => t,
+                None => {
+                    // We have no existing state: need to scan local disk for layers first.
+                    let timeline_state =
+                        init_timeline_state(self.conf, tenant_shard_id, timeline).await;
+
+                    // Re-acquire detail lock now that we're done with async load from local FS
+                    self.secondary_state
+                        .detail
+                        .lock()
+                        .unwrap()
+                        .timelines
+                        .insert(timeline.timeline_id, timeline_state.clone());
+                    timeline_state
+                }
+            };
+
+            timeline_states.insert(timeline.timeline_id, timeline_state);
+        }
+
         // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
         // principle that deletions should be done before writes wherever possible, and so that we can use this
         // phase to initialize our SecondaryProgress.
@@ -579,6 +612,10 @@ impl<'a> TenantDownloader<'a> {
 
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
+            let timeline_state = timeline_states
+                .remove(&timeline.timeline_id)
+                .expect("Just populated above");
+
             if self.secondary_state.cancel.is_cancelled() {
                 tracing::debug!(
                     "Cancelled before downloading timeline {}",
@@ -588,7 +625,7 @@ impl<'a> TenantDownloader<'a> {
             }
 
             let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline, ctx)
+            self.download_timeline(timeline, timeline_state, ctx)
                 .instrument(tracing::info_span!(
                     "secondary_download_timeline",
                     tenant_id=%tenant_shard_id.tenant_id,
@@ -609,6 +646,22 @@ impl<'a> TenantDownloader<'a> {
                 .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
         });
 
+        // Robustness: we should have updated progress properly, but in case we didn't, make sure
+        // we don't leave the tenant in a state where we claim to have successfully downloaded
+        // everything, but our progress is incomplete.  The invariant here should be that if
+        // we have set `last_download` to this heatmap's etag, then the next time we see that
+        // etag we can safely do no work (i.e. we must be complete).
+        let mut progress = self.secondary_state.progress.lock().unwrap();
+        debug_assert!(progress.layers_downloaded == progress.layers_total);
+        debug_assert!(progress.bytes_downloaded == progress.bytes_total);
+        if progress.layers_downloaded != progress.layers_total
+            || progress.bytes_downloaded != progress.bytes_total
+        {
+            tracing::warn!("Correcting drift in progress stats ({progress:?})");
+            progress.layers_downloaded = progress.layers_total;
+            progress.bytes_downloaded = progress.bytes_total;
+        }
+
         Ok(())
     }
 
@@ -784,6 +837,7 @@ impl<'a> TenantDownloader<'a> {
     async fn download_timeline(
         &self,
         timeline: HeatMapTimeline,
+        timeline_state: SecondaryDetailTimeline,
         ctx: &RequestContext,
     ) -> Result<(), UpdateError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
@@ -792,34 +846,6 @@ impl<'a> TenantDownloader<'a> {
         // Accumulate updates to the state
         let mut touched = Vec::new();
 
-        // Clone a view of what layers already exist on disk
-        let timeline_state = self
-            .secondary_state
-            .detail
-            .lock()
-            .unwrap()
-            .timelines
-            .get(&timeline.timeline_id)
-            .cloned();
-
-        let timeline_state = match timeline_state {
-            Some(t) => t,
-            None => {
-                // We have no existing state: need to scan local disk for layers first.
-                let timeline_state =
-                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
-
-                // Re-acquire detail lock now that we're done with async load from local FS
-                self.secondary_state
-                    .detail
-                    .lock()
-                    .unwrap()
-                    .timelines
-                    .insert(timeline.timeline_id, timeline_state.clone());
-                timeline_state
-            }
-        };
-
         tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
 
         let mut download_futs = Vec::new();
@@ -1009,6 +1035,14 @@ impl<'a> TenantDownloader<'a> {
                     "Skipped downloading missing layer {}, raced with compaction/gc?",
                     layer.name
                 );
+
+                // If the layer is 404, adjust the progress statistics to reflect that we will not download it.
+                let mut progress = self.secondary_state.progress.lock().unwrap();
+                progress.layers_total = progress.layers_total.saturating_sub(1);
+                progress.bytes_total = progress
+                    .bytes_total
+                    .saturating_sub(layer.metadata.file_size);
+
                 return Ok(None);
             }
             Err(e) => return Err(e.into()),

From 478cc37a70c4357e7817e6c8834264be4d3e3bfd Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 21 May 2024 14:32:29 +0300
Subject: [PATCH 089/130] Propagate standby apply LSN to pageserver to hold off
 GC.

To avoid pageserver gc'ing data needed by standby, propagate standby apply LSN
through standby -> safekeeper -> broker -> pageserver flow and hold off GC for
it. Iteration of GC resets the value to remove the horizon when standby goes
away -- pushes are assumed to happen at least once between gc iterations. As a
safety guard max allowed lag compared to normal GC horizon is hardcoded as 10GB.
Add test for the feature.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 libs/safekeeper_api/src/models.rs             |  3 +
 pageserver/src/tenant/timeline.rs             | 28 +++++-
 .../walreceiver/connection_manager.rs         | 13 +++
 safekeeper/src/broker.rs                      |  1 +
 safekeeper/src/http/routes.rs                 |  1 +
 safekeeper/src/send_wal.rs                    | 85 +++++++++++++++----
 safekeeper/src/timeline.rs                    |  7 +-
 storage_broker/benches/rps.rs                 |  1 +
 storage_broker/proto/broker.proto             |  3 +
 storage_broker/src/bin/storage_broker.rs      |  1 +
 test_runner/fixtures/neon_fixtures.py         | 11 +++
 test_runner/regress/test_hot_standby.py       | 47 +++++++---
 12 files changed, 169 insertions(+), 32 deletions(-)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index ce5a1e411e..2fbc333075 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -50,6 +50,9 @@ pub struct SkTimelineInfo {
     pub safekeeper_connstr: Option<String>,
     #[serde(default)]
     pub http_connstr: Option<String>,
+    // Minimum of all active RO replicas flush LSN
+    #[serde(default = "lsn_invalid")]
+    pub standby_horizon: Lsn,
 }
 
 #[derive(Debug, Clone, Deserialize, Serialize)]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7f2a41d90c..2c43c26359 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -269,6 +269,8 @@ pub struct Timeline {
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,
 
+    pub(crate) standby_horizon: AtomicLsn,
+
     // WAL redo manager. `None` only for broken tenants.
     walredo_mgr: Option<Arc<super::WalRedoManager>>,
 
@@ -2279,6 +2281,8 @@ impl Timeline {
                 compaction_lock: tokio::sync::Mutex::default(),
                 gc_lock: tokio::sync::Mutex::default(),
 
+                standby_horizon: AtomicLsn::new(0),
+
                 timeline_get_throttle: resources.timeline_get_throttle,
 
                 aux_files: tokio::sync::Mutex::new(AuxFilesState {
@@ -4844,7 +4848,29 @@ impl Timeline {
             (horizon_cutoff, pitr_cutoff, retain_lsns)
         };
 
-        let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let standby_horizon = self.standby_horizon.load();
+        // Hold GC for the standby, but as a safety guard do it only within some
+        // reasonable lag.
+        if standby_horizon != Lsn::INVALID {
+            if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
+                const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
+                if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
+                    new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
+                    trace!("holding off GC for standby apply LSN {}", standby_horizon);
+                } else {
+                    warn!(
+                        "standby is lagging for more than {}MB, not holding gc for it",
+                        MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
+                    )
+                }
+            }
+        }
+
+        // Reset standby horizon to ignore it if it is not updated till next GC.
+        // It is an easy way to unset it when standby disappears without adding
+        // more conf options.
+        self.standby_horizon.store(Lsn::INVALID);
 
         let res = self
             .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 991e4ac045..a3c7adae44 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -705,6 +705,7 @@ impl ConnectionManagerState {
                     commit_lsn: info.commit_lsn,
                     safekeeper_connstr: info.safekeeper_connstr,
                     availability_zone: info.availability_zone,
+                    standby_horizon: info.standby_horizon,
                 }
             }
             MessageType::SafekeeperDiscoveryResponse => {
@@ -725,6 +726,17 @@ impl ConnectionManagerState {
 
         WALRECEIVER_BROKER_UPDATES.inc();
 
+        trace!(
+            "safekeeper info update: standby_horizon(cutoff)={}",
+            timeline_update.standby_horizon
+        );
+        if timeline_update.standby_horizon != 0 {
+            // ignore reports from safekeepers not connected to replicas
+            self.timeline
+                .standby_horizon
+                .store(Lsn(timeline_update.standby_horizon));
+        }
+
         let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
         let old_entry = self.wal_stream_candidates.insert(
             new_safekeeper_id,
@@ -1094,6 +1106,7 @@ mod tests {
                 commit_lsn,
                 safekeeper_connstr: safekeeper_connstr.to_owned(),
                 availability_zone: None,
+                standby_horizon: 0,
             },
             latest_update,
         }
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index ea16ce450f..46a51438ea 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -186,6 +186,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
                         commit_lsn: sk_info.commit_lsn,
                         safekeeper_connstr: sk_info.safekeeper_connstr,
                         availability_zone: sk_info.availability_zone,
+                        standby_horizon: 0,
                     };
 
                     // note this is a blocking call
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 30d0081a47..808bb1e490 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -350,6 +350,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
         backup_lsn: sk_info.backup_lsn.0,
         local_start_lsn: sk_info.local_start_lsn.0,
         availability_zone: None,
+        standby_horizon: sk_info.standby_horizon.0,
     };
 
     let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index ecaae9cfe7..4edd09a318 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -23,7 +23,7 @@ use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use std::cmp::min;
+use std::cmp::{max, min};
 use std::net::SocketAddr;
 use std::str;
 use std::sync::Arc;
@@ -85,8 +85,17 @@ impl StandbyReply {
 
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct StandbyFeedback {
-    reply: StandbyReply,
-    hs_feedback: HotStandbyFeedback,
+    pub reply: StandbyReply,
+    pub hs_feedback: HotStandbyFeedback,
+}
+
+impl StandbyFeedback {
+    pub fn empty() -> Self {
+        StandbyFeedback {
+            reply: StandbyReply::empty(),
+            hs_feedback: HotStandbyFeedback::empty(),
+        }
+    }
 }
 
 /// WalSenders registry. Timeline holds it (wrapped in Arc).
@@ -162,8 +171,8 @@ impl WalSenders {
     }
 
     /// Get aggregated hot standby feedback (we send it to compute).
-    pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
-        self.mutex.lock().agg_hs_feedback
+    pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
+        self.mutex.lock().agg_standby_feedback
     }
 
     /// Record new pageserver feedback, update aggregated values.
@@ -184,6 +193,10 @@ impl WalSenders {
     fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
         let mut shared = self.mutex.lock();
         let slot = shared.get_slot_mut(id);
+        debug!(
+            "Record standby reply: ts={} apply_lsn={}",
+            reply.reply_ts, reply.apply_lsn
+        );
         match &mut slot.feedback {
             ReplicationFeedback::Standby(sf) => sf.reply = *reply,
             ReplicationFeedback::Pageserver(_) => {
@@ -208,7 +221,7 @@ impl WalSenders {
                 })
             }
         }
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
     }
 
     /// Get remote_consistent_lsn reported by the pageserver. Returns None if
@@ -226,13 +239,13 @@ impl WalSenders {
     fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
         let mut shared = self.mutex.lock();
         shared.slots[id] = None;
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
     }
 }
 
 struct WalSendersShared {
     // aggregated over all walsenders value
-    agg_hs_feedback: HotStandbyFeedback,
+    agg_standby_feedback: StandbyFeedback,
     // last feedback ever received from any pageserver, empty if none
     last_ps_feedback: PageserverFeedback,
     // total counter of pageserver feedbacks received
@@ -243,7 +256,7 @@ struct WalSendersShared {
 impl WalSendersShared {
     fn new() -> Self {
         WalSendersShared {
-            agg_hs_feedback: HotStandbyFeedback::empty(),
+            agg_standby_feedback: StandbyFeedback::empty(),
             last_ps_feedback: PageserverFeedback::empty(),
             ps_feedback_counter: 0,
             slots: Vec::new(),
@@ -260,10 +273,11 @@ impl WalSendersShared {
         self.slots[id].as_mut().expect("walsender doesn't exist")
     }
 
-    /// Update aggregated hot standy feedback. We just take min of valid xmins
+    /// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins
     /// and ts.
-    fn update_hs_feedback(&mut self) {
+    fn update_reply_feedback(&mut self) {
         let mut agg = HotStandbyFeedback::empty();
+        let mut reply_agg = StandbyReply::empty();
         for ws_state in self.slots.iter().flatten() {
             if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
                 let hs_feedback = standby_feedback.hs_feedback;
@@ -276,7 +290,7 @@ impl WalSendersShared {
                     } else {
                         agg.xmin = hs_feedback.xmin;
                     }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
                 }
                 if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
                     if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
@@ -284,11 +298,43 @@ impl WalSendersShared {
                     } else {
                         agg.catalog_xmin = hs_feedback.catalog_xmin;
                     }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
+                }
+                let reply = standby_feedback.reply;
+                if reply.write_lsn != Lsn::INVALID {
+                    if reply_agg.write_lsn != Lsn::INVALID {
+                        reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
+                    } else {
+                        reply_agg.write_lsn = reply.write_lsn;
+                    }
+                }
+                if reply.flush_lsn != Lsn::INVALID {
+                    if reply_agg.flush_lsn != Lsn::INVALID {
+                        reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
+                    } else {
+                        reply_agg.flush_lsn = reply.flush_lsn;
+                    }
+                }
+                if reply.apply_lsn != Lsn::INVALID {
+                    if reply_agg.apply_lsn != Lsn::INVALID {
+                        reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
+                    } else {
+                        reply_agg.apply_lsn = reply.apply_lsn;
+                    }
+                }
+                if reply.reply_ts != 0 {
+                    if reply_agg.reply_ts != 0 {
+                        reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
+                    } else {
+                        reply_agg.reply_ts = reply.reply_ts;
+                    }
                 }
             }
         }
-        self.agg_hs_feedback = agg;
+        self.agg_standby_feedback = StandbyFeedback {
+            reply: reply_agg,
+            hs_feedback: agg,
+        };
     }
 }
 
@@ -793,8 +839,11 @@ mod tests {
     fn test_hs_feedback_no_valid() {
         let mut wss = WalSendersShared::new();
         push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
-        wss.update_hs_feedback();
-        assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
+        wss.update_reply_feedback();
+        assert_eq!(
+            wss.agg_standby_feedback.hs_feedback.xmin,
+            INVALID_FULL_TRANSACTION_ID
+        );
     }
 
     #[test]
@@ -803,7 +852,7 @@ mod tests {
         push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
         push_feedback(&mut wss, hs_feedback(1, 42));
         push_feedback(&mut wss, hs_feedback(1, 64));
-        wss.update_hs_feedback();
-        assert_eq!(wss.agg_hs_feedback.xmin, 42);
+        wss.update_reply_feedback();
+        assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
     }
 }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 64f764f191..e97247dc7c 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -248,6 +248,7 @@ impl SharedState {
         &self,
         ttid: &TenantTimelineId,
         conf: &SafeKeeperConf,
+        standby_apply_lsn: Lsn,
     ) -> SafekeeperTimelineInfo {
         SafekeeperTimelineInfo {
             safekeeper_id: conf.my_id.0,
@@ -270,6 +271,7 @@ impl SharedState {
             backup_lsn: self.sk.state.inmem.backup_lsn.0,
             local_start_lsn: self.sk.state.local_start_lsn.0,
             availability_zone: conf.availability_zone.clone(),
+            standby_horizon: standby_apply_lsn.0,
         }
     }
 
@@ -663,7 +665,7 @@ impl Timeline {
 
             // if this is AppendResponse, fill in proper hot standby feedback.
             if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                resp.hs_feedback = self.walsenders.get_hotstandby();
+                resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
             }
 
             commit_lsn = shared_state.sk.state.inmem.commit_lsn;
@@ -716,7 +718,8 @@ impl Timeline {
     /// Get safekeeper info for broadcasting to broker and other peers.
     pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
         let shared_state = self.write_shared_state().await;
-        shared_state.get_safekeeper_info(&self.ttid, conf)
+        let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
+        shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
     }
 
     /// Update timeline state with peer safekeeper data.
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index d66cbefa45..1a6fb7fedf 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -147,6 +147,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                 http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
                 local_start_lsn: 0,
                 availability_zone: None,
+                standby_horizon: 0,
             };
             counter += 1;
             yield info;
diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto
index 7d1b63d23f..a420fd9c66 100644
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -42,6 +42,7 @@ message SafekeeperTimelineInfo {
     uint64 remote_consistent_lsn = 7;
     uint64 peer_horizon_lsn = 8;
     uint64 local_start_lsn = 9;
+    uint64 standby_horizon = 14;
     // A connection string to use for WAL receiving.
     string safekeeper_connstr = 10;
     // HTTP endpoint connection string
@@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse {
     string safekeeper_connstr = 4;
     // Availability zone of a safekeeper.
     optional string availability_zone = 5;
+    // Replica apply LSN
+    uint64 standby_horizon = 6;
 }
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index 8c88b61abc..0a4af543ab 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -736,6 +736,7 @@ mod tests {
             http_connstr: "neon-1-sk-1.local:7677".to_owned(),
             local_start_lsn: 0,
             availability_zone: None,
+            standby_horizon: 0,
         })
     }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 23f30804b4..41377e2db2 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4288,6 +4288,17 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):
         time.sleep(1)
 
 
+def log_replica_lag(primary: Endpoint, secondary: Endpoint):
+    last_replay_lsn = Lsn(
+        secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False)
+    )
+    primary_lsn = Lsn(
+        primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False)
+    )
+    lag = primary_lsn - last_replay_lsn
+    log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}")
+
+
 def wait_for_last_flush_lsn(
     env: NeonEnv,
     endpoint: Endpoint,
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 179cc273ec..31f436cb4c 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,9 +1,20 @@
 import os
 import re
+import threading
 import time
+from functools import partial
 
+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    log_replica_lag,
+    tenant_get_shards,
+    wait_replica_caughtup,
+)
+from fixtures.utils import wait_until
 
 
 # Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -104,19 +115,28 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
                 wait_replica_caughtup(primary, secondary2)
 
 
-# We had an issue that a standby server made GetPage requests with an
-# old LSN, based on the last-written LSN cache, to avoid waits in the
-# pageserver.  However, requesting a page with a very old LSN, such
-# that the GC horizon has already advanced past it, results in an
-# error from the pageserver:
-# "Bad request: tried to request a page version that was garbage collected"
+# Test two different scenarios related to gc of data needed by hot standby.
 #
-# To avoid that, the compute<-> pageserver protocol was updated so
-# that that the standby now sends two LSNs, the old last-written LSN
-# and the current replay LSN.
+# When pause_apply is False, standby is mostly caught up with the primary.
+# However, in compute <-> pageserver protocol version 1 only one LSN had been
+# sent to the pageserver in page request, and to avoid waits in the pageserver
+# it was last-written LSN cache value. If page hasn't been updated for a long
+# time that resulted in an error from the pageserver: "Bad request: tried to
+# request a page version that was garbage collected". For primary this wasn't a
+# problem because pageserver always bumped LSN to the newest one; for standy
+# that would be incorrect since we might get page fresher then apply LSN. Hence,
+# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN
+# in case of standby) and not_modified_since which could be used as an
+# optimization to avoid waiting.
 #
 # https://github.com/neondatabase/neon/issues/6211
-def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
+#
+# When pause_apply is True we model standby lagging behind primary (e.g. due to
+# high max_standby_streaming_delay). To prevent pageserver from removing data
+# still needed by the standby apply LSN is propagated in standby -> safekeepers
+# -> broker -> pageserver flow so that pageserver could hold off gc for it.
+@pytest.mark.parametrize("pause_apply", [False, True])
+def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
     tenant_conf = {
         # set PITR interval to be small, so we can do GC
         "pitr_interval": "0 s",
@@ -160,6 +180,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
             # so we still remember the LSNs of the pages.
             s_cur.execute("SELECT clear_buffer_cache()")
 
+            if pause_apply:
+                s_cur.execute("SELECT pg_wal_replay_pause()")
+
             # Do other stuff on the primary, to advance the WAL
             p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")
 
@@ -176,6 +199,8 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
             # generates use old not_modified_since LSNs, older than
             # the GC cutoff, but new request LSNs. (In protocol
             # version 1 there was only one LSN, and this failed.)
+            log_replica_lag(primary, secondary)
             s_cur.execute("SELECT COUNT(*) FROM test")
+            log_replica_lag(primary, secondary)
             res = s_cur.fetchone()
             assert res[0] == 10000

From f54c3b96e08f78e56d1f82f8305994e9cc8f867f Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 21 May 2024 14:42:25 +0300
Subject: [PATCH 090/130] Fix bugs in hot standby feedback propagation and add
 test for it.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/walproposer_pg.c              | 19 +++---
 safekeeper/src/send_wal.rs              |  9 ++-
 test_runner/regress/test_hot_standby.py | 88 +++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index e5ef93b456..492a46fd54 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1852,34 +1852,30 @@ static void
 CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 {
 	hs->ts = 0;
-	hs->xmin.value = ~0;		/* largest unsigned value */
-	hs->catalog_xmin.value = ~0;	/* largest unsigned value */
+	hs->xmin = InvalidFullTransactionId;
+	hs->catalog_xmin = InvalidFullTransactionId;
 
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
-		if (wp->safekeeper[i].appendResponse.hs.ts != 0)
+
+		if (wp->safekeeper[i].state == SS_ACTIVE)
 		{
 			HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
 
 			if (FullTransactionIdIsNormal(skhs->xmin)
-				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
+				&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
 			{
 				hs->xmin = skhs->xmin;
 				hs->ts = skhs->ts;
 			}
 			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
-				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
+				&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
 			{
 				hs->catalog_xmin = skhs->catalog_xmin;
 				hs->ts = skhs->ts;
 			}
 		}
 	}
-
-	if (hs->xmin.value == ~0)
-		hs->xmin = InvalidFullTransactionId;
-	if (hs->catalog_xmin.value == ~0)
-		hs->catalog_xmin = InvalidFullTransactionId;
 }
 
 /*
@@ -1946,9 +1942,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	}
 
 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
+	if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
 		agg_hs_feedback = hsFeedback;
+		elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
 		ProcessStandbyHSFeedback(hsFeedback.ts,
 								 XidFromFullTransactionId(hsFeedback.xmin),
 								 EpochFromFullTransactionId(hsFeedback.xmin),
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 4edd09a318..5a9745e1c9 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -756,8 +756,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
         match msg.first().cloned() {
             Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                 // Note: deserializing is on m[1..] because we skip the tag byte.
-                let hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
                     .context("failed to deserialize HotStandbyFeedback")?;
+                // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
+                // pq_sendint32(&reply_message, xmin);
+                // pq_sendint32(&reply_message, xmin_epoch);
+                // So it is two big endian 32-bit words in low endian order!
+                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
+                hs_feedback.catalog_xmin =
+                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
                 self.ws_guard
                     .walsenders
                     .record_hs_feedback(self.ws_guard.id, &hs_feedback);
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 31f436cb4c..244d482c18 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -204,3 +204,91 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
             log_replica_lag(primary, secondary)
             res = s_cur.fetchone()
             assert res[0] == 10000
+
+
+def run_pgbench(connstr: str, pg_bin: PgBin):
+    log.info(f"Start a pgbench workload on pg {connstr}")
+    # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    log.info("pgbench init done")
+    pg_bin.run_capture(["pgbench", "-T60", connstr])
+
+
+# assert that pgbench_accounts and its index are created.
+def pgbench_accounts_initialized(ep):
+    ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")
+
+
+# Test that hot_standby_feedback works in neon (it is forwarded through
+# safekeepers). That is, ensure queries on standby don't fail during load on
+# primary under the following conditions:
+# - pgbench bombards primary with updates.
+# - On the secondary we run long select of the updated table.
+# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts
+#   so apply doesn't need to wait.
+# - Do agressive vacuum on primary which still shouldn't create conflicts.
+#   Actually this appears to be redundant due to microvacuum existence.
+#
+# Without hs feedback enabled we'd see 'User query might have needed to see row
+# versions that must be removed.' errors.
+def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+    agressive_vacuum_conf = [
+        "log_autovacuum_min_duration = 0",
+        "autovacuum_naptime = 10s",
+        "autovacuum_vacuum_threshold = 25",
+        "autovacuum_vacuum_scale_factor = 0.1",
+        "autovacuum_vacuum_cost_delay = -1",
+    ]
+    with env.endpoints.create_start(
+        branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf
+    ) as primary:
+        # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with
+        # 'User was holding shared buffer pin for too long.'.
+        with env.endpoints.new_replica_start(
+            origin=primary,
+            endpoint_id="secondary",
+            config_lines=[
+                "max_standby_streaming_delay=2s",
+                "neon.protocol_version=2",
+                "hot_standby_feedback=true",
+            ],
+        ) as secondary:
+            log.info(
+                f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
+            )
+            t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
+            t.start()
+            # Wait until pgbench_accounts is created + filled on replica *and*
+            # index is created. Otherwise index creation would conflict with
+            # read queries and hs feedback won't save us.
+            wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
+
+            # Test should fail if hs feedback is disabled anyway, but cross
+            # check that walproposer sets some xmin.
+            def xmin_is_not_null():
+                slot_xmin = primary.safe_psql_scalar(
+                    "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                    log_query=False,
+                )
+                log.info(f"xmin is {slot_xmin}")
+                assert int(slot_xmin) > 0
+
+            wait_until(10, 1.0, xmin_is_not_null)
+            for _ in range(1, 5):
+                # in debug mode takes about 5-7s
+                balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
+                log.info(f"balance={balance}")
+                log_replica_lag(primary, secondary)
+            t.join()
+
+        # check xmin is reset when standby is gone
+        def xmin_is_null():
+            slot_xmin = primary.safe_psql_scalar(
+                "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                log_query=False,
+            )
+            log.info(f"xmin is {slot_xmin}")
+            assert slot_xmin is None
+
+        wait_until(10, 1.0, xmin_is_null)

From f2771a99b7cc286a936ecc27949d305d1fe7103c Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 20 May 2024 14:28:36 +0300
Subject: [PATCH 091/130] Add metric for pageserver standby horizon.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pageserver/src/metrics.rs                         | 15 +++++++++++++++
 pageserver/src/tenant/timeline.rs                 |  3 +++
 .../timeline/walreceiver/connection_manager.rs    |  4 ++++
 test_runner/fixtures/metrics.py                   |  1 +
 4 files changed, 23 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 5315f0b936..27e25d8e32 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_standby_horizon",
+        "Standby apply LSN for which GC is hold off, by timeline.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_resident_physical_size",
@@ -2098,6 +2107,7 @@ pub(crate) struct TimelineMetrics {
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
+    pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
@@ -2167,6 +2177,9 @@ impl TimelineMetrics {
         let last_record_gauge = LAST_RECORD_LSN
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let standby_horizon_gauge = STANDBY_HORIZON
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
@@ -2212,6 +2225,7 @@ impl TimelineMetrics {
             find_gc_cutoffs_histo,
             load_layer_map_histo,
             last_record_gauge,
+            standby_horizon_gauge,
             resident_physical_size_gauge,
             current_logical_size_gauge,
             aux_file_size_gauge,
@@ -2246,6 +2260,7 @@ impl TimelineMetrics {
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2c43c26359..b0e5275b5f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4871,6 +4871,9 @@ impl Timeline {
         // It is an easy way to unset it when standby disappears without adding
         // more conf options.
         self.standby_horizon.store(Lsn::INVALID);
+        self.metrics
+            .standby_horizon_gauge
+            .set(Lsn::INVALID.0 as i64);
 
         let res = self
             .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index a3c7adae44..1d2ffec08f 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -735,6 +735,10 @@ impl ConnectionManagerState {
             self.timeline
                 .standby_horizon
                 .store(Lsn(timeline_update.standby_horizon));
+            self.timeline
+                .metrics
+                .standby_horizon_gauge
+                .set(timeline_update.standby_horizon as i64);
         }
 
         let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 8fa67e75c9..8b8075f8c1 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -142,6 +142,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_resident_physical_size",
     "pageserver_io_operations_bytes_total",
     "pageserver_last_record_lsn",
+    "pageserver_standby_horizon",
     "pageserver_smgr_query_seconds_bucket",
     "pageserver_smgr_query_seconds_count",
     "pageserver_smgr_query_seconds_sum",

From d43dcceef9bac464cd2b80bd8f40cfae496c7f62 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 20 May 2024 18:41:39 +0300
Subject: [PATCH 092/130] Minimize hot standby feedback xmins to next_xid.

Hot standby feedback xmins can be greater than next_xid due to sparse update of
nextXid on pageserver (to do less writes it advances next xid on
1024). ProcessStandbyHSFeedback ignores such xids from the future; to fix,
minimize received xmin to next_xid.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/walproposer_pg.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 492a46fd54..316e23a72e 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1944,13 +1944,26 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
 	if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
+		FullTransactionId xmin = hsFeedback.xmin;
+		FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
+		FullTransactionId next_xid = ReadNextFullTransactionId();
+		/*
+		 * Page server is updating nextXid in checkpoint each 1024 transactions,
+		 * so feedback xmin can be actually larger then nextXid and
+		 * function TransactionIdInRecentPast return false in this case,
+		 * preventing update of slot's xmin.
+		 */
+		if (FullTransactionIdPrecedes(next_xid, xmin))
+			xmin = next_xid;
+		if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
+			catalog_xmin = next_xid;
 		agg_hs_feedback = hsFeedback;
 		elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
 		ProcessStandbyHSFeedback(hsFeedback.ts,
-								 XidFromFullTransactionId(hsFeedback.xmin),
-								 EpochFromFullTransactionId(hsFeedback.xmin),
-								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
-								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
+								 XidFromFullTransactionId(xmin),
+								 EpochFromFullTransactionId(xmin),
+								 XidFromFullTransactionId(catalog_xmin),
+								 EpochFromFullTransactionId(catalog_xmin));
 	}
 
 	CheckGracefulShutdown(wp);

From d9d471e3c43c9e47b0ffc1ce9a796bd15aacf36c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 14 May 2024 13:49:04 -0500
Subject: [PATCH 093/130] Add some Python typing in a few test files

---
 test_runner/fixtures/neon_fixtures.py  |  7 ++++++-
 test_runner/regress/test_pg_regress.py | 22 +++++++++++++++-------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 41377e2db2..5c865baa54 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2721,7 +2721,12 @@ class PgBin:
         env.update(env_add)
         return env
 
-    def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
+    def run(
+        self,
+        command: List[str],
+        env: Optional[Env] = None,
+        cwd: Optional[Union[str, Path]] = None,
+    ):
         """
         Run one of the postgres binaries.
 
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 2b1b7fff34..c00a8ff6b7 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -1,8 +1,10 @@
 #
 # This file runs pg_regress-based tests.
 #
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING
 
 import pytest
 from fixtures.neon_fixtures import (
@@ -11,6 +13,12 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.remote_storage import s3_storage
 
+if TYPE_CHECKING:
+    from typing import Optional
+
+    from fixtures.neon_fixtures import PgBin
+    from pytest import CaptureFixture
+
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
@@ -19,8 +27,8 @@ def test_pg_regress(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     build_type: str,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: Optional[int],
@@ -86,8 +94,8 @@ def test_pg_regress(
 def test_isolation(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: Optional[int],
@@ -142,8 +150,8 @@ def test_isolation(
 def test_sql_regress(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: Optional[int],

From e8b8ebfa1da3f528b5ebfb0a5aff7b946a8eabc1 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 17 May 2024 13:39:33 -0500
Subject: [PATCH 094/130] Allow check_restored_datadir_content to ignore
 certain files

Some files may have known differences that we are okay with.
---
 test_runner/fixtures/neon_fixtures.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5c865baa54..7a660b64ed 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4150,7 +4150,12 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 
 
 # pg is the existing and running compute node, that we want to compare with a basebackup
-def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
+def check_restored_datadir_content(
+    test_output_dir: Path,
+    env: NeonEnv,
+    endpoint: Endpoint,
+    ignored_files: Optional[list[str]] = None,
+):
     pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
 
     # Get the timeline ID. We need it for the 'basebackup' command
@@ -4203,6 +4208,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
             if not f.startswith("pg_xact") and not f.startswith("pg_multixact")
         ]
 
+    if ignored_files:
+        pgdata_files = [f for f in pgdata_files if f not in ignored_files]
+        restored_files = [f for f in restored_files if f not in ignored_files]
+
     # check that file sets are equal
     assert pgdata_files == restored_files
 

From 9a4b896636465d4d24e5b11c6a7bdb6aed83f23f Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 17 May 2024 13:50:36 -0500
Subject: [PATCH 095/130] Use a constant for database name in test_pg_regress

---
 test_runner/regress/test_pg_regress.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index c00a8ff6b7..302f94064c 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -33,6 +33,8 @@ def test_pg_regress(
     pg_distrib_dir: Path,
     shard_count: Optional[int],
 ):
+    DBNAME = "regression"
+
     """
     :param shard_count: if None, create an unsharded tenant.  Otherwise create a tenant with this
                         many shards.
@@ -50,7 +52,7 @@ def test_pg_regress(
 
     # Connect to postgres and create a database called "regression".
     endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("CREATE DATABASE regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_regress to run in.
     runpath = test_output_dir / "regress"

From 8030b8e4c5f50a2320888433a39e550bfa1ec22e Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 17 May 2024 13:56:02 -0500
Subject: [PATCH 096/130] Fix test_pg_regress for unlogged relations

Previously we worked around file comparison issues by dropping unlogged
relations in the pg_regress tests, but this would lead to an unnecessary
diff when compared to upstream in our Postgres fork. Instead, we can
precompute the files that we know will be different, and ignore them.
---
 test_runner/regress/test_pg_regress.py | 65 +++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 302f94064c..885a94a557 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -4,13 +4,14 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 import pytest
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     check_restored_datadir_content,
 )
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import s3_storage
 
 if TYPE_CHECKING:
@@ -87,7 +88,67 @@ def test_pg_regress(
     with capsys.disabled():
         pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
 
-        check_restored_datadir_content(test_output_dir, env, endpoint)
+        ignored_files: Optional[list[str]] = None
+
+        # Neon handles unlogged relations in a special manner. During a
+        # basebackup, we ship the init fork as the main fork. This presents a
+        # problem in that the endpoint's data directory and the basebackup will
+        # have differences and will fail the eventual file comparison.
+        #
+        # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
+        # support for setting the persistence of a table in 9.5. The reason that
+        # this doesn't affect versions < 15 (but probably would between 9.1 and
+        # 9.5) is that all the regression tests that deal with unlogged tables
+        # up until that point dropped the unlogged tables or set them to logged
+        # at some point during the test.
+        #
+        # In version 15, Postgres grew support for unlogged sequences, and with
+        # that came a few more regression tests. These tests did not all drop
+        # the unlogged tables/sequences prior to finishing.
+        #
+        # But unlogged sequences came with a bug in that, sequences didn't
+        # inherit the persistence of their "parent" tables if they had one. This
+        # was fixed and backported to 15, thus exacerbating our problem a bit.
+        #
+        # So what we can do is just ignore file differences between the data
+        # directory and basebackup for unlogged relations.
+        results = cast(
+            "list[tuple[str, str]]",
+            endpoint.safe_psql(
+                """
+            SELECT
+                relkind,
+                pg_relation_filepath(
+                    pg_filenode_relation(reltablespace, relfilenode)
+                ) AS unlogged_relation_paths
+            FROM pg_class
+            WHERE relpersistence = 'u'
+            """,
+                dbname=DBNAME,
+            ),
+        )
+
+        unlogged_relation_files: list[str] = []
+        for r in results:
+            unlogged_relation_files.append(r[1])
+            # This is related to the following Postgres commit:
+            #
+            # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
+            # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+            # Date:   2023-08-23 09:21:31 -0500
+            #
+            # Use the buffer cache when initializing an unlogged index.
+            #
+            # This patch was backpatched to 16. Without it, the LSN in the
+            # page header would be 0/0 in the data directory, which wouldn't
+            # match the LSN generated during the basebackup, thus creating
+            # a difference.
+            if env.pg_version <= PgVersion.V15 and r[0] == "i":
+                unlogged_relation_files.append(f"{r[1]}_init")
+
+        ignored_files = unlogged_relation_files
+
+        check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
 
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.

From 781352bd8e4dca5676168fe67521b178fa91a0ec Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 20 May 2024 10:01:58 -0500
Subject: [PATCH 097/130] Upgrade Postgres v14 to 14.12

---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index d6f7e2c604..21ec61d539 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a
+Subproject commit 21ec61d539d22a81fe811c2d79e26436820bc3f4
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c5b55762fa..ec82786109 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
   "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
   "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"],
-  "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"]
+  "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"]
 }

From 9d081851ec846523d3893f8e66e8fb5112c1e7ca Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 20 May 2024 10:03:09 -0500
Subject: [PATCH 098/130] Upgrade Postgres v15 to 15.7

---
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index f0d6b0ef75..e2dbd63345 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a
+Subproject commit e2dbd63345c584de75173c27951f111249ae0016
diff --git a/vendor/revisions.json b/vendor/revisions.json
index ec82786109..d022e6f455 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
   "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
-  "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"],
+  "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"],
   "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"]
 }

From e3415706b7b23a30f21051a8063bc257a79e3953 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 20 May 2024 10:04:04 -0500
Subject: [PATCH 099/130] Upgrade Postgres v16 to 16.3

---
 libs/walproposer/src/walproposer.rs | 4 ++--
 vendor/postgres-v16                 | 2 +-
 vendor/revisions.json               | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index fb815607a7..f7b72b205f 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -496,9 +496,9 @@ mod tests {
                 // TODO: When updating Postgres versions, this test will cause
                 // problems. Postgres version in message needs updating.
                 //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                 vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                     147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                     188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 8ef3c33aa0..c271017c6c 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46
+Subproject commit c271017c6c4846be59948766baec2ba4ace5dc9c
diff --git a/vendor/revisions.json b/vendor/revisions.json
index d022e6f455..a3af9331fe 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
+  "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"],
   "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"],
   "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"]
 }

From 1988ad8db702a24afc19ce4332e1199531441a8c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 17 May 2024 13:42:51 -0500
Subject: [PATCH 100/130] Extend test_unlogged to include a sequence

Unlogged sequences were added in v15, so let's just test to make sure
they work on Neon.
---
 test_runner/regress/test_unlogged.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
index 708bf0dfeb..137d28b9fa 100644
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -1,4 +1,5 @@
 from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.pg_version import PgVersion
 
 
 #
@@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv):
     cur.execute("CREATE UNLOGGED TABLE iut (id int);")
     # create index to test unlogged index relation as well
     cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
-    cur.execute("INSERT INTO iut values (42);")
+    cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
+    cur.execute("INSERT INTO iut (id) values (42);")
 
     # create another compute to fetch inital empty contents from pageserver
     fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
@@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv):
     conn2 = endpoint2.connect()
     cur2 = conn2.cursor()
     # after restart table should be empty but valid
-    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
+    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)")
     cur2.execute("EXECUTE iut_plan (43);")
     cur2.execute("SELECT * FROM iut")
-    assert cur2.fetchall() == [(43,)]
+    results = cur2.fetchall()
+    # Unlogged sequences were introduced in v15. On <= v14, the sequence created
+    # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps
+    # the old value (2) on restart. While on v15 and above, it's unlogged, so it
+    # gets reset to 1.
+    if env.pg_version <= PgVersion.V14:
+        assert results == [(43, 2)]
+    else:
+        assert results == [(43, 1)]

From 353afe4fe7dc2c5a131aa01d06d10d0d229fd84a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 21 May 2024 16:13:54 +0100
Subject: [PATCH 101/130] neon_local: run controller's postgres with fsync=off
 (#7817)

## Problem

In `test_storage_controller_many_tenants` we
[occasionally](https://neon-github-public-dev.s3.amazonaws.com/reports/main/9155810417/index.html#/testresult/8fbdf57a0e859c2d)
see it hit the retry limit on serializable transactions. That's likely
due to a combination of relative slow fsync on the hetzner nodes running
the test, and the way the test does lots of parallel timeline creations,
putting high load on the drive.

Running the storage controller's db with fsync=off may help here.

## Summary of changes

- Set `fsync=off` in the postgres config for the database used by the
storage controller in tests
---
 control_plane/src/storage_controller.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 96e8276f4d..b6b7ea7762 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -243,9 +243,13 @@ impl StorageController {
                 anyhow::bail!("initdb failed with status {status}");
             }
 
+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
             tokio::fs::write(
                 &pg_data_path.join("postgresql.conf"),
-                format!("port = {}", self.postgres_port),
+                format!("port = {}\nfsync=off\n", self.postgres_port),
             )
             .await?;
         };

From a8a88ba7bcf99505b1552858db0f5415974b8f8a Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 21 May 2024 20:08:43 +0300
Subject: [PATCH 102/130] test(detach_ancestor): ensure L0 compaction in
 history is ok (#7813)

detaching a timeline from its ancestor can leave the resulting timeline
with more L0 layers than the compaction threshold. most of the time, the
detached timeline has made progress, and next time the L0 -> L1
compaction happens near the original branch point and not near the
last_record_lsn.

add a test to ensure that inheriting the historical L0s does not change
fullbackup. additionally:
- add `wait_until_completed` to test-only timeline checkpoint and
compact HTTP endpoints. with `?wait_until_completed=true` the endpoints
will wait until the remote client has completed uploads.
- for delta layers, describe L0-ness with the `/layer` endpoint

Cc: #6994
---
 libs/pageserver_api/src/models.rs             |   2 +
 pageserver/src/http/routes.rs                 |  11 ++
 pageserver/src/tenant/storage_layer/layer.rs  |   1 +
 .../src/tenant/storage_layer/layer_name.rs    |  16 +-
 test_runner/fixtures/pageserver/http.py       |  20 ++-
 test_runner/regress/test_compaction.py        |   3 +-
 test_runner/regress/test_layer_eviction.py    |   4 +-
 test_runner/regress/test_ondemand_download.py |   1 -
 .../regress/test_pageserver_layer_rolling.py  |   2 +-
 test_runner/regress/test_sharding.py          |   1 -
 .../regress/test_timeline_detach_ancestor.py  | 157 ++++++++++++++++++
 test_runner/regress/test_timeline_size.py     |   2 +-
 12 files changed, 200 insertions(+), 20 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 451ee1a13c..05a444d738 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -789,6 +789,8 @@ pub enum HistoricLayerInfo {
         lsn_end: Lsn,
         remote: bool,
         access_stats: LayerAccessStats,
+
+        l0: bool,
     },
     Image {
         layer_file_name: String,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0eab6510ca..ec3b1141f3 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1736,6 +1736,8 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
         flags |= CompactFlags::ForceImageLayerCreation;
     }
+    let wait_until_uploaded =
+        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1744,6 +1746,9 @@ async fn timeline_compact_handler(
             .compact(&cancel, flags, &ctx)
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        if wait_until_uploaded {
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
+        }
         json_response(StatusCode::OK, ())
     }
     .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1768,6 +1773,8 @@ async fn timeline_checkpoint_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
         flags |= CompactFlags::ForceImageLayerCreation;
     }
+    let wait_until_uploaded =
+        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1781,6 +1788,10 @@ async fn timeline_checkpoint_handler(
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
 
+        if wait_until_uploaded {
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
+        }
+
         json_response(StatusCode::OK, ())
     }
     .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 97b349f635..45d61ce048 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1264,6 +1264,7 @@ impl LayerInner {
                 lsn_end: lsn_range.end,
                 remote: !resident,
                 access_stats,
+                l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
             }
         } else {
             let lsn = self.desc.image_layer_lsn();
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index c733404693..da26e1eeb7 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -347,37 +347,33 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
 mod test {
     use super::*;
     #[test]
-    fn image_layer_parse() -> anyhow::Result<()> {
+    fn image_layer_parse() {
         let expected = LayerName::Image(ImageLayerName {
             key_range: Key::from_i128(0)
                 ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
             lsn: Lsn::from_hex("00000000014FED58").unwrap(),
         });
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
         assert_eq!(parsed, expected,);
 
         // Omitting generation suffix is valid
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
         assert_eq!(parsed, expected,);
-
-        Ok(())
     }
 
     #[test]
-    fn delta_layer_parse() -> anyhow::Result<()> {
+    fn delta_layer_parse() {
         let expected = LayerName::Delta(DeltaLayerName {
             key_range: Key::from_i128(0)
                 ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
             lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
                 ..Lsn::from_hex("000000000154C481").unwrap(),
         });
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
         assert_eq!(parsed, expected);
 
         // Omitting generation suffix is valid
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
         assert_eq!(parsed, expected);
-
-        Ok(())
     }
 }
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 4d563a532b..f1f96f6d5f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -56,20 +56,30 @@ class InMemoryLayerInfo:
 class HistoricLayerInfo:
     kind: str
     layer_file_name: str
-    layer_file_size: Optional[int]
+    layer_file_size: int
     lsn_start: str
     lsn_end: Optional[str]
     remote: bool
+    # None for image layers, true if pageserver thinks this is an L0 delta layer
+    l0: Optional[bool]
 
     @classmethod
     def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
+        # instead of parsing the key range lets keep the definition of "L0" in pageserver
+        l0_ness = d.get("l0")
+        assert l0_ness is None or isinstance(l0_ness, bool)
+
+        size = d["layer_file_size"]
+        assert isinstance(size, int)
+
         return HistoricLayerInfo(
             kind=d["kind"],
             layer_file_name=d["layer_file_name"],
-            layer_file_size=d.get("layer_file_size"),
+            layer_file_size=size,
             lsn_start=d["lsn_start"],
             lsn_end=d.get("lsn_end"),
             remote=d["remote"],
+            l0=l0_ness,
         )
 
 
@@ -583,6 +593,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         force_repartition=False,
         force_image_layer_creation=False,
+        wait_until_uploaded=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -590,6 +601,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["force_repartition"] = "true"
         if force_image_layer_creation:
             query["force_image_layer_creation"] = "true"
+        if wait_until_uploaded:
+            query["wait_until_uploaded"] = "true"
 
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -656,6 +669,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         force_repartition=False,
         force_image_layer_creation=False,
+        wait_until_uploaded=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -663,6 +677,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["force_repartition"] = "true"
         if force_image_layer_creation:
             query["force_image_layer_creation"] = "true"
+        if wait_until_uploaded:
+            query["wait_until_uploaded"] = "true"
 
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 93a16620a3..6a3515e1bd 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -165,7 +165,6 @@ def test_sharding_compaction(
                 image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
 
                 # Pageserver should assert rather than emit an empty layer file, but double check here
-                assert layer.layer_file_size is not None
                 assert layer.layer_file_size > 0
 
         shard_has_image_layers.append(len(image_layer_sizes) > 1)
@@ -178,7 +177,7 @@ def test_sharding_compaction(
             #
             # We only do this check with tiny stripes, because large stripes may not give all shards enough
             # data to have statistically significant image layers
-            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)  # type: ignore
+            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)
             log.info(f"Shard {shard_id} average image layer size: {avg_size}")
             assert avg_size > compaction_target_size / 2
 
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 0ef4f6d95b..193149ea03 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -272,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
             resident_physical_size_metric == 0
         ), "ensure that resident_physical_size metric is zero"
         assert resident_physical_size_metric == sum(
-            layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
+            layer.layer_file_size for layer in info.historic_layers if not layer.remote
         ), "ensure that resident_physical_size metric corresponds to layer map dump"
 
         remote_physical_size_metric = ps_http.get_timeline_metric(
             tenant_id, timeline_id, "pageserver_remote_physical_size"
         )
         assert remote_physical_size_metric == sum(
-            layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
+            layer.layer_file_size for layer in info.historic_layers if layer.remote
         ), "ensure that remote_physical_size metric corresponds to layer map dump"
 
     log.info("before runnning GC, ensure that remote_physical size is zero")
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index b51754c9e0..b137fb3a5c 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -540,7 +540,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
 
     for layer in layers.historic_layers:
         log.info(f"pre-compact:  {layer}")
-        assert layer.layer_file_size is not None, "we must know layer file sizes"
         layer_sizes += layer.layer_file_size
         pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
 
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index aab0536f5a..66b6185aaa 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
             total_historic_bytes += sum(
                 layer.layer_file_size
                 for layer in layer_map.historic_layers
-                if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn
+                if Lsn(layer.lsn_start) > initdb_lsn
             )
             total_ephemeral_layers += len(layer_map.in_memory_layers)
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 1bfeec6f4b..bbb1ad0c6d 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -632,7 +632,6 @@ def test_sharding_ingest_layer_sizes(
         historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
 
         for layer in historic_layers:
-            assert layer.layer_file_size is not None
             if layer.layer_file_size < expect_layer_size // 2:
                 classification = "Small"
                 small_layer_count += 1
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 8406de8bc1..3e435caeee 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -482,6 +482,163 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
 
 
+def test_compaction_induced_by_detaches_in_history(
+    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin
+):
+    """
+    Assuming the tree of timelines:
+
+    root
+    |- child1
+       |- ...
+          |- wanted_detached_child
+
+    Each detach can add N more L0 per level, this is actually unbounded because
+    compaction can be arbitrarily delayed (or detach happen right before one
+    starts). If "wanted_detached_child" has already made progress and compacted
+    L1s, we want to make sure "compaction in the history" does not leave the
+    timeline broken.
+    """
+
+    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            # we want to create layers manually so we don't branch on arbitrary
+            # Lsn, but we also do not want to compact L0 -> L1.
+            "compaction_threshold": "99999",
+            "compaction_period": "0s",
+            # shouldn't matter, but just in case
+            "gc_period": "0s",
+        }
+    )
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+    client = env.pageserver.http_client()
+
+    def delta_layers(timeline_id: TimelineId):
+        # shorthand for more readable formatting
+        return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers()
+
+    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql("create table integers (i bigint not null);")
+        ep.safe_psql("insert into integers (i) values (42)")
+        branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+
+        client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+
+        assert len(delta_layers(env.initial_timeline)) == 2
+
+    more_good_numbers = range(0, 3)
+
+    branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)]
+
+    for num in more_good_numbers:
+        branch_name = f"br-{len(branches)}"
+        branch_timeline_id = env.neon_cli.create_branch(
+            branch_name,
+            ancestor_branch_name=branches[-1][0],
+            tenant_id=env.initial_tenant,
+            ancestor_start_lsn=branch_lsn,
+        )
+        branches.append((branch_name, branch_timeline_id))
+
+        with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep:
+            ep.safe_psql(
+                f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)"
+            )
+            branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id)
+            client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+
+        assert len(delta_layers(branch_timeline_id)) == 1
+
+    # now fill in the final, most growing timeline
+
+    branch_name, branch_timeline_id = branches[-1]
+    with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)")
+
+        last_suffix = None
+        for suffix in range(0, 4):
+            ep.safe_psql(f"create table other_table_{suffix} as select * from integers")
+            wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id)
+            client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+            last_suffix = suffix
+
+        assert last_suffix is not None
+
+        assert len(delta_layers(branch_timeline_id)) == 5
+
+        client.patch_tenant_config_client_side(
+            env.initial_tenant, {"compaction_threshold": 5}, None
+        )
+
+        client.timeline_compact(env.initial_tenant, branch_timeline_id)
+
+        # one more layer
+        ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers")
+        wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id)
+
+        # we need to wait here, because the detaches will do implicit tenant restart,
+        # and we could get unexpected layer counts
+        client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True)
+
+    assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1
+
+    skip_main = branches[1:]
+    branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"]
+
+    # take the fullbackup before and after inheriting the new L0s
+    fullbackup_before = test_output_dir / "fullbackup-before.tar"
+    cmd = [
+        "psql",
+        "--no-psqlrc",
+        env.pageserver.connstr(),
+        "-c",
+        f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}",
+        "-o",
+        str(fullbackup_before),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
+
+    for _, timeline_id in skip_main:
+        reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
+        assert reparented == set(), "we have no earlier branches at any level"
+
+    post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
+    assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total"
+
+    # checkpoint does compaction, which in turn decides to run, because
+    # there is now in total threshold number L0s even if they are not
+    # adjacent in Lsn space:
+    #
+    # inherited  flushed during this checkpoint
+    #       \\\\ /
+    #       1234X5---> lsn
+    #           |
+    #       l1 layers from "fill in the final, most growing timeline"
+    #
+    # branch_lsn is between 4 and first X.
+    client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+
+    post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
+    assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted"
+
+    fullbackup_after = test_output_dir / "fullbackup_after.tar"
+    cmd = [
+        "psql",
+        "--no-psqlrc",
+        env.pageserver.connstr(),
+        "-c",
+        f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}",
+        "-o",
+        str(fullbackup_after),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
+
+    # we don't need to skip any files, because zenith.signal will be identical
+    tar_cmp(fullbackup_before, fullbackup_after, set())
+
+
 # TODO:
 # - after starting the operation, tenant is deleted
 # - after starting the operation, pageserver is shutdown, restarted
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index a6d06df3b6..db5297870e 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -656,7 +656,7 @@ def get_physical_size_values(
     client = env.pageserver.http_client()
 
     res.layer_map_file_size_sum = sum(
-        layer.layer_file_size or 0
+        layer.layer_file_size
         for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers
     )
 

From e3f6a07ca300549181042752c5943c71424e476a Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Tue, 21 May 2024 13:33:29 -0400
Subject: [PATCH 103/130] chore(pageserver): remove metrics for in-memory
 ingestion (#7823)

The metrics was added in https://github.com/neondatabase/neon/pull/7515/
to observe if https://github.com/neondatabase/neon/pull/7467 introduces
any perf regressions.

The change was deployed on 5/7 and no changes are observed in the
metrics. So it's safe to remove the metrics now.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/metrics.rs           | 7 -------
 pageserver/src/pgdatadir_mapping.rs | 5 -----
 2 files changed, 12 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 27e25d8e32..4f2c75d308 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1867,7 +1867,6 @@ pub(crate) struct WalIngestMetrics {
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
-    pub(crate) time_spent_on_ingest: Histogram,
 }
 
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1891,12 +1890,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
         "Number of WAL records filtered out due to sharding"
     )
     .expect("failed to define a metric"),
-    time_spent_on_ingest: register_histogram!(
-        "pageserver_wal_ingest_put_value_seconds",
-        "Actual time spent on ingesting a record",
-        redo_histogram_time_buckets!(),
-    )
-    .expect("failed to define a metric"),
 });
 
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index b4fc4a08ee..f9d8c1020d 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,7 +9,6 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::metrics::WAL_INGEST;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
@@ -1702,8 +1701,6 @@ impl<'a> DatadirModification<'a> {
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
         let mut writer = self.tline.writer().await;
 
-        let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
-
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
@@ -1743,8 +1740,6 @@ impl<'a> DatadirModification<'a> {
             writer.update_directory_entries_count(kind, count as u64);
         }
 
-        timer.observe_duration();
-
         Ok(())
     }
 

From 679e031cf641d45d95705c61e121aa40781cc86f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 21 May 2024 23:31:20 +0200
Subject: [PATCH 104/130] Add dummy lsn lease http and page service APIs
 (#7815)

We want to introduce a concept of temporary and expiring LSN leases.
This adds both a http API as well as one for the page service to obtain
temporary LSN leases.

This adds a dummy implementation to unblock integration work of this
API. A functional implementation of the lease feature is deferred to a
later step.

Fixes #7808

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 libs/pageserver_api/src/models.rs    | 16 ++++++
 libs/pageserver_api/src/shard.rs     |  8 +++
 pageserver/src/http/openapi_spec.yml | 40 +++++++++++++++
 pageserver/src/http/routes.rs        | 30 ++++++++++++
 pageserver/src/page_service.rs       | 73 ++++++++++++++++++++++++++++
 pageserver/src/tenant/mgr.rs         |  9 +++-
 pageserver/src/tenant/timeline.rs    | 16 +++++-
 7 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 05a444d738..55e9c48421 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -162,6 +162,22 @@ impl std::fmt::Debug for TenantState {
     }
 }
 
+/// A temporary lease to a specific lsn inside a timeline.
+/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
+#[serde_as]
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct LsnLease {
+    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
+    pub valid_until: SystemTime,
+}
+
+serde_with::serde_conv!(
+    SystemTimeAsRfc3339Millis,
+    SystemTime,
+    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
+    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
+);
+
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 43d9b2e48c..1c05a01926 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -559,6 +559,14 @@ impl ShardIdentity {
         }
     }
 
+    /// Obtains the shard number and count combined into a `ShardIndex`.
+    pub fn shard_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_count: self.count,
+            shard_number: self.number,
+        }
+    }
+
     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 36c74ed140..107bcd4a22 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -257,6 +257,37 @@ paths:
               schema:
                 $ref: "#/components/schemas/LsnByTimestampResponse"
 
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    post:
+      description: Obtain lease for the given LSN
+      parameters:
+        - name: lsn
+          in: query
+          required: true
+          schema:
+            type: string
+            format: hex
+          description: A LSN to obtain the lease for
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LsnLease"
+
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
     parameters:
       - name: tenant_id
@@ -980,6 +1011,15 @@ components:
           type: string
           enum: [past, present, future, nodata]
 
+    LsnLease:
+      type: object
+      required:
+        - valid_until
+      properties:
+        valid_until:
+          type: string
+          format: date-time
+
     PageserverUtilization:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ec3b1141f3..c75e4ca5a9 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1701,6 +1701,32 @@ async fn handle_tenant_break(
     json_response(StatusCode::OK, ())
 }
 
+// Obtains an lsn lease on the given timeline.
+async fn lsn_lease_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let lsn: Lsn = parse_query_param(&request, "lsn")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    let result = timeline
+        .make_lsn_lease(lsn, &ctx)
+        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
+
+    json_response(StatusCode::OK, result)
+}
+
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
     mut request: Request<Body>,
@@ -2712,6 +2738,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
+            |r| api_handler(r, lsn_lease_handler),
+        )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
             |r| api_handler(r, timeline_gc_handler),
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 35aba044b2..c066f56c17 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,6 +19,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -33,6 +34,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
+use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
@@ -905,6 +907,39 @@ impl PageServerHandler {
         }
     }
 
+    #[instrument(skip_all, fields(shard_id, %lsn))]
+    async fn handle_make_lsn_lease<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
+        let timeline = self
+            .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
+            .await?;
+        let lease = timeline.make_lsn_lease(lsn, ctx)?;
+        let valid_until = lease
+            .valid_until
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .map_err(|e| QueryError::Other(e.into()))?;
+
+        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
+            b"valid_until",
+        )]))?
+        .write_message_noflush(&BeMessage::DataRow(&[Some(
+            &valid_until.as_millis().to_be_bytes(),
+        )]))?
+        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+
+        Ok(())
+    }
+
     #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_rel_exists_request(
         &mut self,
@@ -1802,6 +1837,44 @@ where
             // important because psycopg2 executes "SET datestyle TO 'ISO'"
             // on connect
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("lease lsn ") {
+            let (_, params_raw) = query_string.split_at("lease lsn ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+            if params.len() != 3 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number {} for lease lsn command",
+                    params.len()
+                )));
+            }
+
+            let tenant_shard_id = TenantShardId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_shard_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_shard_id.tenant_id))?;
+
+            // The caller is responsible for providing correct lsn.
+            let lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+
+            match self
+                .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error obtaining lsn lease for {lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
         } else if query_string.starts_with("show ") {
             // show <tenant_id>
             let (_, params_raw) = query_string.split_at("show ".len());
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 1d8e2cf6d3..89fdf31849 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,7 +7,7 @@ use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+    ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
@@ -127,6 +127,8 @@ pub(crate) enum ShardSelector {
     First,
     /// Pick the shard that holds this key
     Page(Key),
+    /// The shard ID is known: pick the given shard
+    Known(ShardIndex),
 }
 
 /// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -2067,6 +2069,11 @@ impl TenantManager {
                                 return ShardResolveResult::Found(tenant.clone());
                             }
                         }
+                        ShardSelector::Known(shard)
+                            if tenant.shard_identity.shard_index() == shard =>
+                        {
+                            return ShardResolveResult::Found(tenant.clone());
+                        }
                         _ => continue,
                     }
                 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b0e5275b5f..1f8ee9ffc4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -25,7 +25,7 @@ use pageserver_api::{
     models::{
         AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
         DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
-        TimelineState,
+        LsnLease, TimelineState,
     },
     reltag::BlockNumber,
     shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -1532,6 +1532,20 @@ impl Timeline {
         Ok(())
     }
 
+    /// Obtains a temporary lease blocking garbage collection for the given LSN
+    pub(crate) fn make_lsn_lease(
+        &self,
+        _lsn: Lsn,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
+        let lease = LsnLease {
+            valid_until: SystemTime::now() + LEASE_LENGTH,
+        };
+        // TODO: dummy implementation
+        Ok(lease)
+    }
+
     /// Flush to disk all data that was written with the put_* functions
     #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
     pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {

From 00d66e8012c2d4b0df706fb5e89ab8351e4eb402 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Tue, 21 May 2024 16:52:48 -0700
Subject: [PATCH 105/130] compute_ctl: Fix handling of missing
 /neonvm/bin/resize-swap (#7832)

The logic added in the original PR (#7434) only worked before sudo was
used, because 'sudo foo' will only fail with NotFound if 'sudo' doesn't
exist; if 'foo' doesn't exist, then sudo will fail with a normal error
exit.

This means that compute_ctl may fail to restart if it exits after
successfully enabling swap.
---
 compute_tools/src/swap.rs | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs
index c22b6bc14e..024c5b338e 100644
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -1,3 +1,5 @@
+use std::path::Path;
+
 use anyhow::{anyhow, Context};
 use tracing::warn;
 
@@ -17,17 +19,24 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
         .arg(size_bytes.to_string())
         .spawn();
 
-    if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
-        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
-        return Ok(());
-    }
-
     child_result
         .context("spawn() failed")
         .and_then(|mut child| child.wait().context("wait() failed"))
         .and_then(|status| match status.success() {
             true => Ok(()),
-            false => Err(anyhow!("process exited with {status}")),
+            false => {
+                // The command failed. Maybe it was because the resize-swap file doesn't exist?
+                // The --once flag causes it to delete itself on success so we don't disable swap
+                // while postgres is running; maybe this is fine.
+                match Path::new(RESIZE_SWAP_BIN).try_exists() {
+                    Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
+                    // The path doesn't exist; we're actually ok 
+                    Ok(false) => {
+                        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
+                        Ok(())
+                    },
+                }
+            }
         })
         // wrap any prior error with the overall context that we couldn't run the command
         .with_context(|| {

From bd5cb9e86b2f8623b2528df9fa59daa21759e183 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Wed, 22 May 2024 09:34:39 +0100
Subject: [PATCH 106/130] Implement timeline_manager for safekeeper background
 tasks (#7768)

In safekeepers we have several background tasks. Previously `WAL backup`
task was spawned by another task called `wal_backup_launcher`. That task
received notifications via `wal_backup_launcher_rx` and decided to spawn
or kill existing backup task associated with the timeline. This was
inconvenient because each code segment that touched shared state was
responsible for pushing notification into `wal_backup_launcher_tx`
channel. This was error prone because it's easy to miss and could lead
to deadlock in some cases, if notification pushing was done in the wrong
order.

We also had a similar issue with `is_active` timeline flag. That flag
was calculated based on the state and code modifying the state had to
call function to update the flag. We had a few bugs related to that,
when we forgot to update `is_active` flag in some places where it could
change.

To fix these issues, this PR adds a new `timeline_manager` background
task associated with each timeline. This task is responsible for
managing all background tasks, including `is_active` flag which is used
for pushing broker messages. It is subscribed for updates in timeline
state in a loop and decides to spawn/kill background tasks when needed.

There is a new structure called `TimelinesSet`. It stores a set of
`Arc<Timeline>` and allows to copy the set to iterate without holding
the mutex. This is what replaced `is_active` flag for the broker. Now
broker push task holds a reference to the `TimelinesSet` with active
timelines and use it instead of iterating over all timelines and
filtering by `is_active` flag.

Also added some metrics for manager iterations and active backup tasks.
Ideally manager should be doing not too many iterations and we should
not have a lot of backup tasks spawned at the same time.

Fixes #7751

---------

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
---
 safekeeper/src/bin/safekeeper.rs       |  17 +-
 safekeeper/src/broker.rs               |  11 +-
 safekeeper/src/lib.rs                  |   2 +
 safekeeper/src/metrics.rs              |  35 ++-
 safekeeper/src/receive_wal.rs          |  57 ++--
 safekeeper/src/remove_wal.rs           |  19 +-
 safekeeper/src/timeline.rs             | 404 +++++++++++--------------
 safekeeper/src/timeline_manager.rs     | 153 ++++++++++
 safekeeper/src/timelines_global_map.rs |  60 ++--
 safekeeper/src/timelines_set.rs        |  90 ++++++
 safekeeper/src/wal_backup.rs           | 201 +++++-------
 11 files changed, 587 insertions(+), 462 deletions(-)
 create mode 100644 safekeeper/src/timeline_manager.rs
 create mode 100644 safekeeper/src/timelines_set.rs

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 09c565ce71..aee3898ac7 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -20,7 +20,6 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
-use tokio::sync::mpsc;
 
 use tracing::*;
 use utils::pid_file;
@@ -30,13 +29,13 @@ use safekeeper::defaults::{
     DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
     DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
+use safekeeper::remove_wal;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
 use safekeeper::{http, WAL_REMOVER_RUNTIME};
-use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -377,8 +376,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
     metrics::register_internal(Box::new(timeline_collector))?;
 
-    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
-
     wal_backup::init_remote_storage(&conf);
 
     // Keep handles to main tasks to die if any of them disappears.
@@ -391,19 +388,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let current_thread_rt = conf
         .current_thread_runtime
         .then(|| Handle::try_current().expect("no runtime in main"));
-    let conf_ = conf.clone();
-    let wal_backup_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
-        .spawn(wal_backup::wal_backup_launcher_task_main(
-            conf_,
-            wal_backup_launcher_rx,
-        ))
-        .map(|res| ("WAL backup launcher".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_backup_handle));
 
     // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
+    GlobalTimelines::init(conf.clone()).await?;
 
     let conf_ = conf.clone();
     // Run everything in current thread rt, if asked.
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 46a51438ea..7cc2142291 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -46,6 +46,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
         return Ok(());
     }
 
+    let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
+
     let mut client =
         storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
     let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -57,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
             // sensitive and there is no risk of deadlock as we don't await while
             // lock is held.
             let now = Instant::now();
-            let all_tlis = GlobalTimelines::get_all();
+            let all_tlis = active_timelines_set.get_all();
             let mut n_pushed_tlis = 0;
             for tli in &all_tlis {
-                // filtering alternative futures::stream::iter(all_tlis)
-                //   .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
-                // doesn't look better, and I'm not sure how to do that without collect.
-                if !tli.is_active().await {
-                    continue;
-                }
                 let sk_info = tli.get_safekeeper_info(&conf).await;
                 yield sk_info;
                 BROKER_PUSHED_UPDATES.inc();
@@ -90,6 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }
 
 /// Subscribe and fetch all the interesting data from the broker.
+#[instrument(name = "broker pull", skip_all)]
 async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
     let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
 
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 543714a54e..8d8d2cf23e 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -31,6 +31,8 @@ pub mod safekeeper;
 pub mod send_wal;
 pub mod state;
 pub mod timeline;
+pub mod timeline_manager;
+pub mod timelines_set;
 pub mod wal_backup;
 pub mod wal_backup_partial;
 pub mod wal_service;
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 28ae042bb3..1e965393e3 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -11,8 +11,9 @@ use futures::Future;
 use metrics::{
     core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
     proto::MetricFamily,
-    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
-    IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
+    register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
+    register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
+    IntGaugeVec,
 };
 use once_cell::sync::Lazy;
 
@@ -162,6 +163,29 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
 });
+pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_manager_iterations_total",
+        "Number of iterations of the timeline manager task"
+    )
+    .expect("Failed to register safekeeper_manager_iterations_total counter")
+});
+pub static MANAGER_ACTIVE_CHANGES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_manager_active_changes_total",
+        "Number of timeline active status changes in the timeline manager task"
+    )
+    .expect("Failed to register safekeeper_manager_active_changes_total counter")
+});
+pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
+    register_int_counter_pair!(
+        "safekeeper_wal_backup_tasks_started_total",
+        "Number of active WAL backup tasks",
+        "safekeeper_wal_backup_tasks_finished_total",
+        "Number of finished WAL backup tasks",
+    )
+    .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
+});
 
 pub const LABEL_UNKNOWN: &str = "unknown";
 
@@ -614,8 +638,7 @@ impl Collector for TimelineCollector {
         self.written_wal_seconds.reset();
         self.flushed_wal_seconds.reset();
 
-        let timelines = GlobalTimelines::get_all();
-        let timelines_count = timelines.len();
+        let timelines_count = GlobalTimelines::get_all().len();
         let mut active_timelines_count = 0;
 
         // Prometheus Collector is sync, and data is stored under async lock. To
@@ -746,9 +769,9 @@ impl Collector for TimelineCollector {
 
 async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
     let mut res = vec![];
-    let timelines = GlobalTimelines::get_all();
+    let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
 
-    for tli in timelines {
+    for tli in active_timelines {
         if let Some(info) = tli.info_for_metrics().await {
             res.push(info);
         }
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 0356def7df..03cfa882c4 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -45,6 +45,9 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
 pub struct WalReceivers {
     mutex: Mutex<WalReceiversShared>,
     pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
+
+    num_computes_tx: tokio::sync::watch::Sender<usize>,
+    num_computes_rx: tokio::sync::watch::Receiver<usize>,
 }
 
 /// Id under which walreceiver is registered in shmem.
@@ -55,16 +58,21 @@ impl WalReceivers {
         let (pageserver_feedback_tx, _) =
             tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
 
+        let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize);
+
         Arc::new(WalReceivers {
             mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
             pageserver_feedback_tx,
+            num_computes_tx,
+            num_computes_rx,
         })
     }
 
     /// Register new walreceiver. Returned guard provides access to the slot and
     /// automatically deregisters in Drop.
     pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
-        let slots = &mut self.mutex.lock().slots;
+        let mut shared = self.mutex.lock();
+        let slots = &mut shared.slots;
         let walreceiver = WalReceiverState {
             conn_id,
             status: WalReceiverStatus::Voting,
@@ -78,6 +86,9 @@ impl WalReceivers {
             slots.push(Some(walreceiver));
             pos
         };
+
+        self.update_num(&shared);
+
         WalReceiverGuard {
             id: pos,
             walreceivers: self.clone(),
@@ -99,7 +110,18 @@ impl WalReceivers {
 
     /// Get number of walreceivers (compute connections).
     pub fn get_num(self: &Arc<WalReceivers>) -> usize {
-        self.mutex.lock().slots.iter().flatten().count()
+        self.mutex.lock().get_num()
+    }
+
+    /// Get channel for number of walreceivers.
+    pub fn get_num_rx(self: &Arc<WalReceivers>) -> tokio::sync::watch::Receiver<usize> {
+        self.num_computes_rx.clone()
+    }
+
+    /// Should get called after every update of slots.
+    fn update_num(self: &Arc<WalReceivers>, shared: &MutexGuard<WalReceiversShared>) {
+        let num = shared.get_num();
+        self.num_computes_tx.send_replace(num);
     }
 
     /// Get state of all walreceivers.
@@ -123,6 +145,7 @@ impl WalReceivers {
     fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
         let mut shared = self.mutex.lock();
         shared.slots[id] = None;
+        self.update_num(&shared);
     }
 
     /// Broadcast pageserver feedback to connected walproposers.
@@ -137,6 +160,13 @@ struct WalReceiversShared {
     slots: Vec<Option<WalReceiverState>>,
 }
 
+impl WalReceiversShared {
+    /// Get number of walreceivers (compute connections).
+    fn get_num(&self) -> usize {
+        self.slots.iter().flatten().count()
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalReceiverState {
     /// None means it is recovery initiated by us (this safekeeper).
@@ -456,14 +486,7 @@ impl WalAcceptor {
     /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
     /// it must mean that network thread terminated.
     async fn run(&mut self) -> anyhow::Result<()> {
-        // Register the connection and defer unregister.
-        // Order of the next two lines is important: we want first to remove our entry and then
-        // update status which depends on registered connections.
-        let _compute_conn_guard = ComputeConnectionGuard {
-            timeline: Arc::clone(&self.tli),
-        };
         let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
-        self.tli.update_status_notify().await?;
 
         // After this timestamp we will stop processing AppendRequests and send a response
         // to the walproposer. walproposer sends at least one AppendRequest per second,
@@ -529,19 +552,3 @@ impl WalAcceptor {
         }
     }
 }
-
-/// Calls update_status_notify in drop to update timeline status.
-struct ComputeConnectionGuard {
-    timeline: Arc<Timeline>,
-}
-
-impl Drop for ComputeConnectionGuard {
-    fn drop(&mut self) {
-        let tli = self.timeline.clone();
-        tokio::spawn(async move {
-            if let Err(e) = tli.update_status_notify().await {
-                error!("failed to update timeline status: {}", e);
-            }
-        });
-    }
-}
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index 9dce06a886..98ce671182 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -7,29 +7,18 @@ use tracing::*;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
 
-const ALLOW_INACTIVE_TIMELINES: bool = true;
-
-pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
+pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
     let wal_removal_interval = Duration::from_millis(5000);
     loop {
         let now = tokio::time::Instant::now();
-        let mut active_timelines = 0;
-
         let tlis = GlobalTimelines::get_all();
         for tli in &tlis {
-            let is_active = tli.is_active().await;
-            if is_active {
-                active_timelines += 1;
-            }
-            if !ALLOW_INACTIVE_TIMELINES && !is_active {
-                continue;
-            }
             let ttid = tli.ttid;
             async {
                 if let Err(e) = tli.maybe_persist_control_file().await {
                     warn!("failed to persist control file: {e}");
                 }
-                if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await {
+                if let Err(e) = tli.remove_old_wal().await {
                     error!("failed to remove WAL: {}", e);
                 }
             }
@@ -42,8 +31,8 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
 
         if elapsed > wal_removal_interval {
             info!(
-                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
-                active_timelines, total_timelines, elapsed
+                "WAL removal is too long, processed {} timelines in {:?}",
+                total_timelines, elapsed
             );
         }
 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index e97247dc7c..da2e3f4538 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -8,13 +8,12 @@ use serde::{Deserialize, Serialize};
 use tokio::fs;
 
 use std::cmp::max;
+use std::ops::{Deref, DerefMut};
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::sync::{Mutex, MutexGuard};
-use tokio::{
-    sync::{mpsc::Sender, watch},
-    time::Instant,
-};
+use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+use tokio::{sync::watch, time::Instant};
 use tracing::*;
 use utils::http::error::ApiError;
 use utils::{
@@ -33,12 +32,13 @@ use crate::safekeeper::{
 };
 use crate::send_wal::WalSenders;
 use crate::state::{TimelineMemState, TimelinePersistentState};
+use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::FullTimelineInfo;
 use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, wal_backup_partial, wal_storage};
+use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};
 
 /// Things safekeeper should know about timeline state on peers.
@@ -51,8 +51,7 @@ pub struct PeerInfo {
     /// LSN of the last record.
     pub flush_lsn: Lsn,
     pub commit_lsn: Lsn,
-    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
-    /// sk since backup_lsn.
+    /// Since which LSN safekeeper has WAL.
     pub local_start_lsn: Lsn,
     /// When info was received. Serde annotations are not very useful but make
     /// the code compile -- we don't rely on this field externally.
@@ -97,25 +96,72 @@ impl PeersInfo {
     }
 }
 
+pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
+
+/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard<SharedState>` that
+/// automatically updates `watch::Sender` channels with state on drop.
+pub struct WriteGuardSharedState<'a> {
+    tli: Arc<Timeline>,
+    guard: RwLockWriteGuard<'a, SharedState>,
+}
+
+impl<'a> WriteGuardSharedState<'a> {
+    fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
+        WriteGuardSharedState { tli, guard }
+    }
+}
+
+impl<'a> Deref for WriteGuardSharedState<'a> {
+    type Target = SharedState;
+
+    fn deref(&self) -> &Self::Target {
+        &self.guard
+    }
+}
+
+impl<'a> DerefMut for WriteGuardSharedState<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.guard
+    }
+}
+
+impl<'a> Drop for WriteGuardSharedState<'a> {
+    fn drop(&mut self) {
+        let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn()));
+        let commit_lsn = self.guard.sk.state.inmem.commit_lsn;
+
+        let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| {
+            if *old != term_flush_lsn {
+                *old = term_flush_lsn;
+                true
+            } else {
+                false
+            }
+        });
+
+        let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| {
+            if *old != commit_lsn {
+                *old = commit_lsn;
+                true
+            } else {
+                false
+            }
+        });
+
+        // send notification about shared state update
+        self.tli.shared_state_version_tx.send_modify(|old| {
+            *old += 1;
+        });
+    }
+}
+
 /// Shared state associated with database instance
 pub struct SharedState {
     /// Safekeeper object
-    sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
+    pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
     /// In memory list containing state of peers sent in latest messages from them.
-    peers_info: PeersInfo,
-    /// True when WAL backup launcher oversees the timeline, making sure WAL is
-    /// offloaded, allows to bother launcher less.
-    wal_backup_active: bool,
-    /// True whenever there is at least some pending activity on timeline: live
-    /// compute connection, pageserver is not caughtup (it must have latest WAL
-    /// for new compute start) or WAL backuping is not finished. Practically it
-    /// means safekeepers broadcast info to peers about the timeline, old WAL is
-    /// trimmed.
-    ///
-    /// TODO: it might be better to remove tli completely from GlobalTimelines
-    /// when tli is inactive instead of having this flag.
-    active: bool,
-    last_removed_segno: XLogSegNo,
+    pub(crate) peers_info: PeersInfo,
+    pub(crate) last_removed_segno: XLogSegNo,
 }
 
 impl SharedState {
@@ -152,8 +198,6 @@ impl SharedState {
         Ok(Self {
             sk,
             peers_info: PeersInfo(vec![]),
-            wal_backup_active: false,
-            active: false,
             last_removed_segno: 0,
         })
     }
@@ -171,75 +215,10 @@ impl SharedState {
         Ok(Self {
             sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
             peers_info: PeersInfo(vec![]),
-            wal_backup_active: false,
-            active: false,
             last_removed_segno: 0,
         })
     }
 
-    fn is_active(&self, num_computes: usize) -> bool {
-        self.is_wal_backup_required(num_computes)
-            // FIXME: add tracking of relevant pageservers and check them here individually,
-            // otherwise migration won't work (we suspend too early).
-            || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn
-    }
-
-    /// Mark timeline active/inactive and return whether s3 offloading requires
-    /// start/stop action. If timeline is deactivated, control file is persisted
-    /// as maintenance task does that only for active timelines.
-    async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool {
-        let is_active = self.is_active(num_computes);
-        if self.active != is_active {
-            info!(
-                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                ttid,
-                is_active,
-                self.sk.state.inmem.remote_consistent_lsn,
-                self.sk.state.inmem.commit_lsn
-            );
-            if !is_active {
-                if let Err(e) = self.sk.state.flush().await {
-                    warn!("control file save in update_status failed: {:?}", e);
-                }
-            }
-        }
-        self.active = is_active;
-        self.is_wal_backup_action_pending(num_computes)
-    }
-
-    /// Should we run s3 offloading in current state?
-    fn is_wal_backup_required(&self, num_computes: usize) -> bool {
-        let seg_size = self.get_wal_seg_size();
-        num_computes > 0 ||
-        // Currently only the whole segment is offloaded, so compare segment numbers.
-            (self.sk.state.inmem.commit_lsn.segment_number(seg_size) >
-             self.sk.state.inmem.backup_lsn.segment_number(seg_size))
-    }
-
-    /// Is current state of s3 offloading is not what it ought to be?
-    fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
-        let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
-        if res {
-            let action_pending = if self.is_wal_backup_required(num_computes) {
-                "start"
-            } else {
-                "stop"
-            };
-            trace!(
-                "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
-                self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn
-            );
-        }
-        res
-    }
-
-    /// Returns whether s3 offloading is required and sets current status as
-    /// matching.
-    fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
-        self.wal_backup_active = self.is_wal_backup_required(num_computes);
-        self.wal_backup_active
-    }
-
     fn get_wal_seg_size(&self) -> usize {
         self.sk.state.server.wal_seg_size as usize
     }
@@ -278,7 +257,7 @@ impl SharedState {
     /// Get our latest view of alive peers status on the timeline.
     /// We pass our own info through the broker as well, so when we don't have connection
     /// to the broker returned vec is empty.
-    fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
+    pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
         let now = Instant::now();
         self.peers_info
             .0
@@ -294,18 +273,13 @@ impl SharedState {
     /// offloading.
     /// While it is safe to use inmem values for determining horizon,
     /// we use persistent to make possible normal states less surprising.
-    fn get_horizon_segno(
-        &self,
-        wal_backup_enabled: bool,
-        extra_horizon_lsn: Option<Lsn>,
-    ) -> XLogSegNo {
+    fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
         let state = &self.sk.state;
 
         use std::cmp::min;
         let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
-        if wal_backup_enabled {
-            horizon_lsn = min(horizon_lsn, state.backup_lsn);
-        }
+        // we don't want to remove WAL that is not yet offloaded to s3
+        horizon_lsn = min(horizon_lsn, state.backup_lsn);
         if let Some(extra_horizon_lsn) = extra_horizon_lsn {
             horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
         }
@@ -346,11 +320,6 @@ impl From<TimelineError> for ApiError {
 pub struct Timeline {
     pub ttid: TenantTimelineId,
 
-    /// Sending here asks for wal backup launcher attention (start/stop
-    /// offloading). Sending ttid instead of concrete command allows to do
-    /// sending without timeline lock.
-    pub wal_backup_launcher_tx: Sender<TenantTimelineId>,
-
     /// Used to broadcast commit_lsn updates to all background jobs.
     commit_lsn_watch_tx: watch::Sender<Lsn>,
     commit_lsn_watch_rx: watch::Receiver<Lsn>,
@@ -362,10 +331,14 @@ pub struct Timeline {
     term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
     term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
 
+    /// Broadcasts shared state updates.
+    shared_state_version_tx: watch::Sender<usize>,
+    shared_state_version_rx: watch::Receiver<usize>,
+
     /// Safekeeper and other state, that should remain consistent and
     /// synchronized with the disk. This is tokio mutex as we write WAL to disk
     /// while holding it, ensuring that consensus checks are in order.
-    mutex: Mutex<SharedState>,
+    mutex: RwLock<SharedState>,
     walsenders: Arc<WalSenders>,
     walreceivers: Arc<WalReceivers>,
 
@@ -384,15 +357,15 @@ pub struct Timeline {
     /// with different speed.
     // TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
     walsenders_keep_horizon: bool,
+
+    // timeline_manager controlled state
+    pub(crate) broker_active: AtomicBool,
+    pub(crate) wal_backup_active: AtomicBool,
 }
 
 impl Timeline {
     /// Load existing timeline from disk.
-    pub fn load_timeline(
-        conf: &SafeKeeperConf,
-        ttid: TenantTimelineId,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
-    ) -> Result<Timeline> {
+    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Timeline> {
         let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
 
         let shared_state = SharedState::restore(conf, &ttid)?;
@@ -402,23 +375,27 @@ impl Timeline {
             shared_state.sk.get_term(),
             shared_state.sk.flush_lsn(),
         )));
+        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
         let (cancellation_tx, cancellation_rx) = watch::channel(false);
 
         let walreceivers = WalReceivers::new();
         Ok(Timeline {
             ttid,
-            wal_backup_launcher_tx,
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             term_flush_lsn_watch_tx,
             term_flush_lsn_watch_rx,
-            mutex: Mutex::new(shared_state),
+            shared_state_version_tx,
+            shared_state_version_rx,
+            mutex: RwLock::new(shared_state),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
             walsenders_keep_horizon: conf.walsenders_keep_horizon,
+            broker_active: AtomicBool::new(false),
+            wal_backup_active: AtomicBool::new(false),
         })
     }
 
@@ -426,7 +403,6 @@ impl Timeline {
     pub fn create_empty(
         conf: &SafeKeeperConf,
         ttid: TenantTimelineId,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
         server_info: ServerInfo,
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
@@ -434,25 +410,30 @@ impl Timeline {
         let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
         let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
             watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
+        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
         let (cancellation_tx, cancellation_rx) = watch::channel(false);
+
         let state =
             TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
 
         let walreceivers = WalReceivers::new();
         Ok(Timeline {
             ttid,
-            wal_backup_launcher_tx,
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             term_flush_lsn_watch_tx,
             term_flush_lsn_watch_rx,
-            mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
+            shared_state_version_tx,
+            shared_state_version_rx,
+            mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
             walsenders_keep_horizon: conf.walsenders_keep_horizon,
+            broker_active: AtomicBool::new(false),
+            wal_backup_active: AtomicBool::new(false),
         })
     }
 
@@ -463,8 +444,9 @@ impl Timeline {
     /// and state on disk should remain unchanged.
     pub async fn init_new(
         self: &Arc<Timeline>,
-        shared_state: &mut MutexGuard<'_, SharedState>,
+        shared_state: &mut WriteGuardSharedState<'_>,
         conf: &SafeKeeperConf,
+        broker_active_set: Arc<TimelinesSet>,
     ) -> Result<()> {
         match fs::metadata(&self.timeline_dir).await {
             Ok(_) => {
@@ -495,16 +477,29 @@ impl Timeline {
 
             return Err(e);
         }
-        self.bootstrap(conf);
+        self.bootstrap(conf, broker_active_set);
         Ok(())
     }
 
-    /// Bootstrap new or existing timeline starting background stasks.
-    pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
+    /// Bootstrap new or existing timeline starting background tasks.
+    pub fn bootstrap(
+        self: &Arc<Timeline>,
+        conf: &SafeKeeperConf,
+        broker_active_set: Arc<TimelinesSet>,
+    ) {
+        // Start manager task which will monitor timeline state and update
+        // background tasks.
+        tokio::spawn(timeline_manager::main_task(
+            self.clone(),
+            conf.clone(),
+            broker_active_set,
+        ));
+
         // Start recovery task which always runs on the timeline.
         if conf.peer_recovery_enabled {
             tokio::spawn(recovery_main(self.clone(), conf.clone()));
         }
+        // TODO: migrate to timeline_manager
         if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
             tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
         }
@@ -517,10 +512,9 @@ impl Timeline {
     /// deletion API endpoint is retriable.
     pub async fn delete(
         &self,
-        shared_state: &mut MutexGuard<'_, SharedState>,
+        shared_state: &mut WriteGuardSharedState<'_>,
         only_local: bool,
-    ) -> Result<(bool, bool)> {
-        let was_active = shared_state.active;
+    ) -> Result<bool> {
         self.cancel(shared_state);
 
         // TODO: It's better to wait for s3 offloader termination before
@@ -534,18 +528,12 @@ impl Timeline {
             wal_backup::delete_timeline(&self.ttid).await?;
         }
         let dir_existed = delete_dir(&self.timeline_dir).await?;
-        Ok((dir_existed, was_active))
+        Ok(dir_existed)
     }
 
     /// Cancel timeline to prevent further usage. Background tasks will stop
     /// eventually after receiving cancellation signal.
-    ///
-    /// Note that we can't notify backup launcher here while holding
-    /// shared_state lock, as this is a potential deadlock: caller is
-    /// responsible for that. Generally we should probably make WAL backup tasks
-    /// to shut down on their own, checking once in a while whether it is the
-    /// time.
-    fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
+    fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
         info!("timeline {} is cancelled", self.ttid);
         let _ = self.cancellation_tx.send(true);
         // Close associated FDs. Nobody will be able to touch timeline data once
@@ -569,30 +557,12 @@ impl Timeline {
     }
 
     /// Take a writing mutual exclusive lock on timeline shared_state.
-    pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
-        self.mutex.lock().await
+    pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
+        WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
     }
 
-    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
-        shared_state
-            .update_status(self.walreceivers.get_num(), self.ttid)
-            .await
-    }
-
-    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
-    pub async fn update_status_notify(&self) -> Result<()> {
-        if self.is_cancelled() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-        let is_wal_backup_action_pending: bool = {
-            let mut shared_state = self.write_shared_state().await;
-            self.update_status(&mut shared_state).await
-        };
-        if is_wal_backup_action_pending {
-            // Can fail only if channel to a static thread got closed, which is not normal at all.
-            self.wal_backup_launcher_tx.send(self.ttid).await?;
-        }
-        Ok(())
+    pub async fn read_shared_state(&self) -> ReadGuardSharedState {
+        self.mutex.read().await
     }
 
     /// Returns true if walsender should stop sending WAL to pageserver. We
@@ -604,7 +574,7 @@ impl Timeline {
         if self.is_cancelled() {
             return true;
         }
-        let shared_state = self.write_shared_state().await;
+        let shared_state = self.read_shared_state().await;
         if self.walreceivers.get_num() == 0 {
             return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
             reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
@@ -612,9 +582,9 @@ impl Timeline {
         false
     }
 
-    /// Ensure taht current term is t, erroring otherwise, and lock the state.
-    pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
-        let ss = self.write_shared_state().await;
+    /// Ensure that current term is t, erroring otherwise, and lock the state.
+    pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
+        let ss = self.read_shared_state().await;
         if ss.sk.state.acceptor_state.term != t {
             bail!(
                 "failed to acquire term {}, current term {}",
@@ -625,18 +595,6 @@ impl Timeline {
         Ok(ss)
     }
 
-    /// Returns whether s3 offloading is required and sets current status as
-    /// matching it.
-    pub async fn wal_backup_attend(&self) -> bool {
-        if self.is_cancelled() {
-            return false;
-        }
-
-        self.write_shared_state()
-            .await
-            .wal_backup_attend(self.walreceivers.get_num())
-    }
-
     /// Returns commit_lsn watch channel.
     pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
         self.commit_lsn_watch_rx.clone()
@@ -647,9 +605,14 @@ impl Timeline {
         self.term_flush_lsn_watch_rx.clone()
     }
 
+    /// Returns watch channel for SharedState update version.
+    pub fn get_state_version_rx(&self) -> watch::Receiver<usize> {
+        self.shared_state_version_rx.clone()
+    }
+
     /// Pass arrived message to the safekeeper.
     pub async fn process_msg(
-        &self,
+        self: &Arc<Self>,
         msg: &ProposerAcceptorMessage,
     ) -> Result<Option<AcceptorProposerMessage>> {
         if self.is_cancelled() {
@@ -657,8 +620,6 @@ impl Timeline {
         }
 
         let mut rmsg: Option<AcceptorProposerMessage>;
-        let commit_lsn: Lsn;
-        let term_flush_lsn: TermLsn;
         {
             let mut shared_state = self.write_shared_state().await;
             rmsg = shared_state.sk.process_msg(msg).await?;
@@ -667,43 +628,28 @@ impl Timeline {
             if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
                 resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
             }
-
-            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
-            term_flush_lsn =
-                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
         }
-        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
-        self.commit_lsn_watch_tx.send(commit_lsn)?;
         Ok(rmsg)
     }
 
     /// Returns wal_seg_size.
     pub async fn get_wal_seg_size(&self) -> usize {
-        self.write_shared_state().await.get_wal_seg_size()
-    }
-
-    /// Returns true only if the timeline is loaded and active.
-    pub async fn is_active(&self) -> bool {
-        if self.is_cancelled() {
-            return false;
-        }
-
-        self.write_shared_state().await.active
+        self.read_shared_state().await.get_wal_seg_size()
     }
 
     /// Returns state of the timeline.
     pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) {
-        let state = self.write_shared_state().await;
+        let state = self.read_shared_state().await;
         (state.sk.state.inmem.clone(), state.sk.state.clone())
     }
 
     /// Returns latest backup_lsn.
     pub async fn get_wal_backup_lsn(&self) -> Lsn {
-        self.write_shared_state().await.sk.state.inmem.backup_lsn
+        self.read_shared_state().await.sk.state.inmem.backup_lsn
     }
 
     /// Sets backup_lsn to the given value.
-    pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
+    pub async fn set_wal_backup_lsn(self: &Arc<Self>, backup_lsn: Lsn) -> Result<()> {
         if self.is_cancelled() {
             bail!(TimelineError::Cancelled(self.ttid));
         }
@@ -717,40 +663,34 @@ impl Timeline {
 
     /// Get safekeeper info for broadcasting to broker and other peers.
     pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
-        let shared_state = self.write_shared_state().await;
         let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
+        let shared_state = self.read_shared_state().await;
         shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
     }
 
     /// Update timeline state with peer safekeeper data.
-    pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> {
-        let is_wal_backup_action_pending: bool;
-        let commit_lsn: Lsn;
+    pub async fn record_safekeeper_info(
+        self: &Arc<Self>,
+        sk_info: SafekeeperTimelineInfo,
+    ) -> Result<()> {
         {
             let mut shared_state = self.write_shared_state().await;
             shared_state.sk.record_safekeeper_info(&sk_info).await?;
             let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
             shared_state.peers_info.upsert(&peer_info);
-            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
-            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
-        }
-        self.commit_lsn_watch_tx.send(commit_lsn)?;
-        // Wake up wal backup launcher, if it is time to stop the offloading.
-        if is_wal_backup_action_pending {
-            self.wal_backup_launcher_tx.send(self.ttid).await?;
         }
         Ok(())
     }
 
     /// Update in memory remote consistent lsn.
-    pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
+    pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
         let mut shared_state = self.write_shared_state().await;
         shared_state.sk.state.inmem.remote_consistent_lsn =
             max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
     }
 
     pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
-        let shared_state = self.write_shared_state().await;
+        let shared_state = self.read_shared_state().await;
         shared_state.get_peers(conf.heartbeat_timeout)
     }
 
@@ -772,7 +712,7 @@ impl Timeline {
     /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
     /// Thus we don't try to predict it here.
     pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
-        let ss = self.write_shared_state().await;
+        let ss = self.read_shared_state().await;
         let term = ss.sk.state.acceptor_state.term;
         let last_log_term = ss.sk.get_epoch();
         let flush_lsn = ss.sk.flush_lsn();
@@ -843,12 +783,12 @@ impl Timeline {
 
     /// Returns flush_lsn.
     pub async fn get_flush_lsn(&self) -> Lsn {
-        self.write_shared_state().await.sk.wal_store.flush_lsn()
+        self.read_shared_state().await.sk.wal_store.flush_lsn()
     }
 
     /// Delete WAL segments from disk that are no longer needed. This is determined
     /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
-    pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
+    pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
         if self.is_cancelled() {
             bail!(TimelineError::Cancelled(self.ttid));
         }
@@ -864,9 +804,8 @@ impl Timeline {
 
         let horizon_segno: XLogSegNo;
         let remover = {
-            let shared_state = self.write_shared_state().await;
-            horizon_segno =
-                shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
+            let shared_state = self.read_shared_state().await;
+            horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
             if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                 return Ok(()); // nothing to do
             }
@@ -888,7 +827,7 @@ impl Timeline {
     /// passed after the last save. This helps to keep remote_consistent_lsn up
     /// to date so that storage nodes restart doesn't cause many pageserver ->
     /// safekeeper reconnections.
-    pub async fn maybe_persist_control_file(&self) -> Result<()> {
+    pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
         self.write_shared_state()
             .await
             .sk
@@ -896,38 +835,33 @@ impl Timeline {
             .await
     }
 
-    /// Gather timeline data for metrics. If the timeline is not active, returns
-    /// None, we do not collect these.
+    /// Gather timeline data for metrics.
     pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
         if self.is_cancelled() {
             return None;
         }
 
         let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
-        let state = self.write_shared_state().await;
-        if state.active {
-            Some(FullTimelineInfo {
-                ttid: self.ttid,
-                ps_feedback_count,
-                last_ps_feedback,
-                wal_backup_active: state.wal_backup_active,
-                timeline_is_active: state.active,
-                num_computes: self.walreceivers.get_num() as u32,
-                last_removed_segno: state.last_removed_segno,
-                epoch_start_lsn: state.sk.epoch_start_lsn,
-                mem_state: state.sk.state.inmem.clone(),
-                persisted_state: state.sk.state.clone(),
-                flush_lsn: state.sk.wal_store.flush_lsn(),
-                wal_storage: state.sk.wal_store.get_metrics(),
-            })
-        } else {
-            None
-        }
+        let state = self.read_shared_state().await;
+        Some(FullTimelineInfo {
+            ttid: self.ttid,
+            ps_feedback_count,
+            last_ps_feedback,
+            wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
+            timeline_is_active: self.broker_active.load(Ordering::Relaxed),
+            num_computes: self.walreceivers.get_num() as u32,
+            last_removed_segno: state.last_removed_segno,
+            epoch_start_lsn: state.sk.epoch_start_lsn,
+            mem_state: state.sk.state.inmem.clone(),
+            persisted_state: state.sk.state.clone(),
+            flush_lsn: state.sk.wal_store.flush_lsn(),
+            wal_storage: state.sk.wal_store.get_metrics(),
+        })
     }
 
     /// Returns in-memory timeline state to build a full debug dump.
     pub async fn memory_dump(&self) -> debug_dump::Memory {
-        let state = self.write_shared_state().await;
+        let state = self.read_shared_state().await;
 
         let (write_lsn, write_record_lsn, flush_lsn, file_open) =
             state.sk.wal_store.internal_state();
@@ -936,8 +870,8 @@ impl Timeline {
             is_cancelled: self.is_cancelled(),
             peers_info_len: state.peers_info.0.len(),
             walsenders: self.walsenders.get_all(),
-            wal_backup_active: state.wal_backup_active,
-            active: state.active,
+            wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
+            active: self.broker_active.load(Ordering::Relaxed),
             num_computes: self.walreceivers.get_num() as u32,
             last_removed_segno: state.last_removed_segno,
             epoch_start_lsn: state.sk.epoch_start_lsn,
@@ -951,7 +885,7 @@ impl Timeline {
 
     /// Apply a function to the control file state and persist it.
     pub async fn map_control_file<T>(
-        &self,
+        self: &Arc<Self>,
         f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
     ) -> Result<T> {
         let mut state = self.write_shared_state().await;
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
new file mode 100644
index 0000000000..52ad915065
--- /dev/null
+++ b/safekeeper/src/timeline_manager.rs
@@ -0,0 +1,153 @@
+//! The timeline manager task is responsible for managing the timeline's background tasks.
+//! It is spawned alongside each timeline and exits when the timeline is deleted.
+//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
+//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
+
+use std::{sync::Arc, time::Duration};
+
+use tracing::{info, instrument, warn};
+use utils::lsn::Lsn;
+
+use crate::{
+    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
+    timeline::{PeerInfo, ReadGuardSharedState, Timeline},
+    timelines_set::TimelinesSet,
+    wal_backup::{self, WalBackupTaskHandle},
+    SafeKeeperConf,
+};
+
+pub struct StateSnapshot {
+    pub commit_lsn: Lsn,
+    pub backup_lsn: Lsn,
+    pub remote_consistent_lsn: Lsn,
+    pub peers: Vec<PeerInfo>,
+}
+
+impl StateSnapshot {
+    /// Create a new snapshot of the timeline state.
+    fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self {
+        Self {
+            commit_lsn: read_guard.sk.state.inmem.commit_lsn,
+            backup_lsn: read_guard.sk.state.inmem.backup_lsn,
+            remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
+            peers: read_guard.get_peers(heartbeat_timeout),
+        }
+    }
+}
+
+/// Control how often the manager task should wake up to check updates.
+/// There is no need to check for updates more often than this.
+const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
+
+/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
+/// background tasks.
+#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
+pub async fn main_task(
+    tli: Arc<Timeline>,
+    conf: SafeKeeperConf,
+    broker_active_set: Arc<TimelinesSet>,
+) {
+    let mut cancellation_rx = match tli.get_cancellation_rx() {
+        Ok(rx) => rx,
+        Err(_) => {
+            info!("timeline canceled during task start");
+            return;
+        }
+    };
+
+    scopeguard::defer! {
+        if tli.is_cancelled() {
+            info!("manager task finished");
+        } else {
+            warn!("manager task finished prematurely");
+        }
+    };
+
+    // sets whether timeline is active for broker pushes or not
+    let mut tli_broker_active = broker_active_set.guard(tli.clone());
+
+    let ttid = tli.ttid;
+    let wal_seg_size = tli.get_wal_seg_size().await;
+    let heartbeat_timeout = conf.heartbeat_timeout;
+
+    let mut state_version_rx = tli.get_state_version_rx();
+
+    let walreceivers = tli.get_walreceivers();
+    let mut num_computes_rx = walreceivers.get_num_rx();
+
+    // list of background tasks
+    let mut backup_task: Option<WalBackupTaskHandle> = None;
+
+    let last_state = 'outer: loop {
+        MANAGER_ITERATIONS_TOTAL.inc();
+
+        let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
+        let num_computes = *num_computes_rx.borrow();
+
+        let is_wal_backup_required =
+            wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
+
+        if conf.is_wal_backup_enabled() {
+            wal_backup::update_task(
+                &conf,
+                ttid,
+                is_wal_backup_required,
+                &state_snapshot,
+                &mut backup_task,
+            )
+            .await;
+        }
+
+        let is_active = is_wal_backup_required
+            || num_computes > 0
+            || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
+
+        // update the broker timeline set
+        if tli_broker_active.set(is_active) {
+            // write log if state has changed
+            info!(
+                "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
+                is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
+            );
+
+            MANAGER_ACTIVE_CHANGES.inc();
+
+            if !is_active {
+                // TODO: maybe use tokio::spawn?
+                if let Err(e) = tli.maybe_persist_control_file().await {
+                    warn!("control file save in update_status failed: {:?}", e);
+                }
+            }
+        }
+
+        // update the state in Arc<Timeline>
+        tli.wal_backup_active
+            .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
+        tli.broker_active
+            .store(is_active, std::sync::atomic::Ordering::Relaxed);
+
+        // wait until something changes. tx channels are stored under Arc, so they will not be
+        // dropped until the manager task is finished.
+        tokio::select! {
+            _ = cancellation_rx.changed() => {
+                // timeline was deleted
+                break 'outer state_snapshot;
+            }
+            _ = async {
+                // don't wake up on every state change, but at most every REFRESH_INTERVAL
+                tokio::time::sleep(REFRESH_INTERVAL).await;
+                let _ = state_version_rx.changed().await;
+            } => {
+                // state was updated
+            }
+            _ = num_computes_rx.changed() => {
+                // number of connected computes was updated
+            }
+        }
+    };
+
+    // shutdown background tasks
+    if conf.is_wal_backup_enabled() {
+        wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
+    }
+}
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 079e706ff8..8d37bd6371 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -4,6 +4,7 @@
 
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{Timeline, TimelineError};
+use crate::timelines_set::TimelinesSet;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -11,16 +12,16 @@ use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
 use std::str::FromStr;
+use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
-use tokio::sync::mpsc::Sender;
 use tracing::*;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 
 struct GlobalTimelinesState {
     timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
-    wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
     conf: Option<SafeKeeperConf>,
+    broker_active_set: Arc<TimelinesSet>,
     load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
 }
 
@@ -36,11 +37,8 @@ impl GlobalTimelinesState {
     }
 
     /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (SafeKeeperConf, Sender<TenantTimelineId>) {
-        (
-            self.get_conf().clone(),
-            self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-        )
+    fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>) {
+        (self.get_conf().clone(), self.broker_active_set.clone())
     }
 
     /// Insert timeline into the map. Returns error if timeline with the same id already exists.
@@ -65,8 +63,8 @@ impl GlobalTimelinesState {
 static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
     Mutex::new(GlobalTimelinesState {
         timelines: HashMap::new(),
-        wal_backup_launcher_tx: None,
         conf: None,
+        broker_active_set: Arc::new(TimelinesSet::default()),
         load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
     })
 });
@@ -76,16 +74,11 @@ pub struct GlobalTimelines;
 
 impl GlobalTimelines {
     /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub async fn init(
-        conf: SafeKeeperConf,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
-    ) -> Result<()> {
+    pub async fn init(conf: SafeKeeperConf) -> Result<()> {
         // clippy isn't smart enough to understand that drop(state) releases the
         // lock, so use explicit block
         let tenants_dir = {
             let mut state = TIMELINES_STATE.lock().unwrap();
-            assert!(state.wal_backup_launcher_tx.is_none());
-            state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
             state.conf = Some(conf);
 
             // Iterate through all directories and load tenants for all directories
@@ -129,12 +122,9 @@ impl GlobalTimelines {
     /// this function is called during init when nothing else is running, so
     /// this is fine.
     async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
-        let (conf, wal_backup_launcher_tx) = {
+        let (conf, broker_active_set) = {
             let state = TIMELINES_STATE.lock().unwrap();
-            (
-                state.get_conf().clone(),
-                state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-            )
+            state.get_dependencies()
         };
 
         let timelines_dir = conf.tenant_dir(&tenant_id);
@@ -147,7 +137,7 @@ impl GlobalTimelines {
                         TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                     {
                         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
+                        match Timeline::load_timeline(&conf, ttid) {
                             Ok(timeline) => {
                                 let tli = Arc::new(timeline);
                                 TIMELINES_STATE
@@ -155,8 +145,7 @@ impl GlobalTimelines {
                                     .unwrap()
                                     .timelines
                                     .insert(ttid, tli.clone());
-                                tli.bootstrap(&conf);
-                                tli.update_status_notify().await.unwrap();
+                                tli.bootstrap(&conf, broker_active_set.clone());
                             }
                             // If we can't load a timeline, it's most likely because of a corrupted
                             // directory. We will log an error and won't allow to delete/recreate
@@ -189,9 +178,9 @@ impl GlobalTimelines {
         _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
         ttid: TenantTimelineId,
     ) -> Result<Arc<Timeline>> {
-        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
+        let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies();
 
-        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
+        match Timeline::load_timeline(&conf, ttid) {
             Ok(timeline) => {
                 let tli = Arc::new(timeline);
 
@@ -202,7 +191,7 @@ impl GlobalTimelines {
                     .timelines
                     .insert(ttid, tli.clone());
 
-                tli.bootstrap(&conf);
+                tli.bootstrap(&conf, broker_active_set);
 
                 Ok(tli)
             }
@@ -221,6 +210,10 @@ impl GlobalTimelines {
         TIMELINES_STATE.lock().unwrap().get_conf().clone()
     }
 
+    pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
+        TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
+    }
+
     /// Create a new timeline with the given id. If the timeline already exists, returns
     /// an existing timeline.
     pub async fn create(
@@ -229,7 +222,7 @@ impl GlobalTimelines {
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
     ) -> Result<Arc<Timeline>> {
-        let (conf, wal_backup_launcher_tx) = {
+        let (conf, broker_active_set) = {
             let state = TIMELINES_STATE.lock().unwrap();
             if let Ok(timeline) = state.get(&ttid) {
                 // Timeline already exists, return it.
@@ -243,7 +236,6 @@ impl GlobalTimelines {
         let timeline = Arc::new(Timeline::create_empty(
             &conf,
             ttid,
-            wal_backup_launcher_tx,
             server_info,
             commit_lsn,
             local_start_lsn,
@@ -264,7 +256,10 @@ impl GlobalTimelines {
             // Write the new timeline to the disk and start background workers.
             // Bootstrap is transactional, so if it fails, the timeline will be deleted,
             // and the state on disk should remain unchanged.
-            if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
+            if let Err(e) = timeline
+                .init_new(&mut shared_state, &conf, broker_active_set)
+                .await
+            {
                 // Note: the most likely reason for init failure is that the timeline
                 // directory already exists on disk. This happens when timeline is corrupted
                 // and wasn't loaded from disk on startup because of that. We want to preserve
@@ -281,8 +276,6 @@ impl GlobalTimelines {
             // We are done with bootstrap, release the lock, return the timeline.
             // {} block forces release before .await
         }
-        timeline.update_status_notify().await?;
-        timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
         Ok(timeline)
     }
 
@@ -335,12 +328,13 @@ impl GlobalTimelines {
         let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
         match tli_res {
             Ok(timeline) => {
+                let was_active = timeline.broker_active.load(Ordering::Relaxed);
+
                 // Take a lock and finish the deletion holding this mutex.
                 let mut shared_state = timeline.write_shared_state().await;
 
                 info!("deleting timeline {}, only_local={}", ttid, only_local);
-                let (dir_existed, was_active) =
-                    timeline.delete(&mut shared_state, only_local).await?;
+                let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
 
                 // Remove timeline from the map.
                 // FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -349,7 +343,7 @@ impl GlobalTimelines {
 
                 Ok(TimelineDeleteForceResult {
                     dir_existed,
-                    was_active,
+                    was_active, // TODO: we probably should remove this field
                 })
             }
             Err(_) => {
diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs
new file mode 100644
index 0000000000..ea8e23bb72
--- /dev/null
+++ b/safekeeper/src/timelines_set.rs
@@ -0,0 +1,90 @@
+use std::{collections::HashMap, sync::Arc};
+
+use utils::id::TenantTimelineId;
+
+use crate::timeline::Timeline;
+
+/// Set of timelines, supports operations:
+/// - add timeline
+/// - remove timeline
+/// - clone the set
+///
+/// Usually used for keeping subset of timelines. For example active timelines that require broker push.
+pub struct TimelinesSet {
+    timelines: std::sync::Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>,
+}
+
+impl Default for TimelinesSet {
+    fn default() -> Self {
+        Self {
+            timelines: std::sync::Mutex::new(HashMap::new()),
+        }
+    }
+}
+
+impl TimelinesSet {
+    pub fn insert(&self, tli: Arc<Timeline>) {
+        self.timelines.lock().unwrap().insert(tli.ttid, tli);
+    }
+
+    pub fn delete(&self, ttid: &TenantTimelineId) {
+        self.timelines.lock().unwrap().remove(ttid);
+    }
+
+    /// If present is true, adds timeline to the set, otherwise removes it.
+    pub fn set_present(&self, tli: Arc<Timeline>, present: bool) {
+        if present {
+            self.insert(tli);
+        } else {
+            self.delete(&tli.ttid);
+        }
+    }
+
+    pub fn is_present(&self, ttid: &TenantTimelineId) -> bool {
+        self.timelines.lock().unwrap().contains_key(ttid)
+    }
+
+    /// Returns all timelines in the set.
+    pub fn get_all(&self) -> Vec<Arc<Timeline>> {
+        self.timelines.lock().unwrap().values().cloned().collect()
+    }
+
+    /// Returns a timeline guard for easy presence control.
+    pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
+        let is_present = self.is_present(&tli.ttid);
+        TimelineSetGuard {
+            timelines_set: self.clone(),
+            tli,
+            is_present,
+        }
+    }
+}
+
+/// Guard is used to add or remove timeline from the set.
+/// If the timeline present in set, it will be removed from it on drop.
+/// Note: do not use more than one guard for the same timeline, it caches the presence state.
+/// It is designed to be used in the manager task only.
+pub struct TimelineSetGuard {
+    timelines_set: Arc<TimelinesSet>,
+    tli: Arc<Timeline>,
+    is_present: bool,
+}
+
+impl TimelineSetGuard {
+    /// Returns true if the state was changed.
+    pub fn set(&mut self, present: bool) -> bool {
+        if present == self.is_present {
+            return false;
+        }
+        self.is_present = present;
+        self.timelines_set.set_present(self.tli.clone(), present);
+        true
+    }
+}
+
+impl Drop for TimelineSetGuard {
+    fn drop(&mut self) {
+        // remove timeline from the map on drop
+        self.timelines_set.delete(&self.tli.ttid);
+    }
+}
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index e496f07114..84680557f9 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -9,7 +9,7 @@ use utils::backoff;
 use utils::id::NodeId;
 
 use std::cmp::min;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -29,9 +29,10 @@ use tracing::*;
 
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
-use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS};
+use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
 use crate::timeline::{PeerInfo, Timeline};
-use crate::{GlobalTimelines, SafeKeeperConf};
+use crate::timeline_manager::StateSnapshot;
+use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
 
 use once_cell::sync::OnceCell;
 
@@ -41,35 +42,84 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 const BUFFER_SIZE: usize = 32 * 1024;
 
-/// Check whether wal backup is required for timeline. If yes, mark that launcher is
-/// aware of current status and return the timeline.
-async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
-    match GlobalTimelines::get(ttid).ok() {
-        Some(tli) => {
-            tli.wal_backup_attend().await;
-            Some(tli)
-        }
-        None => None,
-    }
-}
-
-struct WalBackupTaskHandle {
+pub struct WalBackupTaskHandle {
     shutdown_tx: Sender<()>,
     handle: JoinHandle<()>,
 }
 
-struct WalBackupTimelineEntry {
-    timeline: Arc<Timeline>,
-    handle: Option<WalBackupTaskHandle>,
+/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
+pub fn is_wal_backup_required(
+    wal_seg_size: usize,
+    num_computes: usize,
+    state: &StateSnapshot,
+) -> bool {
+    num_computes > 0 ||
+    // Currently only the whole segment is offloaded, so compare segment numbers.
+    (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size))
 }
 
-async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
-    if let Some(wb_handle) = entry.handle.take() {
+/// Based on peer information determine which safekeeper should offload; if it
+/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
+/// is running, kill it.
+pub async fn update_task(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    need_backup: bool,
+    state: &StateSnapshot,
+    entry: &mut Option<WalBackupTaskHandle>,
+) {
+    let (offloader, election_dbg_str) =
+        determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
+    let elected_me = Some(conf.my_id) == offloader;
+
+    let should_task_run = need_backup && elected_me;
+
+    // start or stop the task
+    if should_task_run != (entry.is_some()) {
+        if should_task_run {
+            info!("elected for backup: {}", election_dbg_str);
+
+            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
+            let timeline_dir = conf.timeline_dir(&ttid);
+
+            let async_task = backup_task_main(
+                ttid,
+                timeline_dir,
+                conf.workdir.clone(),
+                conf.backup_parallel_jobs,
+                shutdown_rx,
+            );
+
+            let handle = if conf.current_thread_runtime {
+                tokio::spawn(async_task)
+            } else {
+                WAL_BACKUP_RUNTIME.spawn(async_task)
+            };
+
+            *entry = Some(WalBackupTaskHandle {
+                shutdown_tx,
+                handle,
+            });
+        } else {
+            if !need_backup {
+                // don't need backup at all
+                info!("stepping down from backup, need_backup={}", need_backup);
+            } else {
+                // someone else has been elected
+                info!("stepping down from backup: {}", election_dbg_str);
+            }
+            shut_down_task(entry).await;
+        }
+    }
+}
+
+async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
+    if let Some(wb_handle) = entry.take() {
         // Tell the task to shutdown. Error means task exited earlier, that's ok.
         let _ = wb_handle.shutdown_tx.send(()).await;
         // Await the task itself. TODO: restart panicked tasks earlier.
         if let Err(e) = wb_handle.handle.await {
-            warn!("WAL backup task for {} panicked: {}", ttid, e);
+            warn!("WAL backup task panicked: {}", e);
         }
     }
 }
@@ -126,49 +176,6 @@ fn determine_offloader(
     }
 }
 
-/// Based on peer information determine which safekeeper should offload; if it
-/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
-/// is running, kill it.
-async fn update_task(
-    conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
-    entry: &mut WalBackupTimelineEntry,
-) {
-    let alive_peers = entry.timeline.get_peers(conf).await;
-    let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
-    let (offloader, election_dbg_str) =
-        determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
-    let elected_me = Some(conf.my_id) == offloader;
-
-    if elected_me != (entry.handle.is_some()) {
-        if elected_me {
-            info!("elected for backup: {}", election_dbg_str);
-
-            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
-            let timeline_dir = conf.timeline_dir(&ttid);
-
-            let handle = tokio::spawn(
-                backup_task_main(
-                    ttid,
-                    timeline_dir,
-                    conf.workdir.clone(),
-                    conf.backup_parallel_jobs,
-                    shutdown_rx,
-                )
-                .in_current_span(),
-            );
-
-            entry.handle = Some(WalBackupTaskHandle {
-                shutdown_tx,
-                handle,
-            });
-        } else {
-            info!("stepping down from backup: {}", election_dbg_str);
-            shut_down_task(ttid, entry).await;
-        }
-    }
-}
-
 static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
 
 // Storage must be configured and initialized when this is called.
@@ -190,67 +197,6 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
     });
 }
 
-const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
-
-/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
-/// tasks. Having this in separate task simplifies locking, allows to reap
-/// panics and separate elections from offloading itself.
-pub async fn wal_backup_launcher_task_main(
-    conf: SafeKeeperConf,
-    mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
-) -> anyhow::Result<()> {
-    info!(
-        "WAL backup launcher started, remote config {:?}",
-        conf.remote_storage
-    );
-
-    // Presence in this map means launcher is aware s3 offloading is needed for
-    // the timeline, but task is started only if it makes sense for to offload
-    // from this safekeeper.
-    let mut tasks: HashMap<TenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
-
-    let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC));
-    loop {
-        tokio::select! {
-            ttid = wal_backup_launcher_rx.recv() => {
-                // channel is never expected to get closed
-                let ttid = ttid.unwrap();
-                if !conf.is_wal_backup_enabled() {
-                    continue; /* just drain the channel and do nothing */
-                }
-                async {
-                    let timeline = is_wal_backup_required(ttid).await;
-                    // do we need to do anything at all?
-                    if timeline.is_some() != tasks.contains_key(&ttid) {
-                        if let Some(timeline) = timeline {
-                            // need to start the task
-                            let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
-                                timeline,
-                                handle: None,
-                            });
-                            update_task(&conf, ttid, entry).await;
-                        } else {
-                            // need to stop the task
-                            info!("stopping WAL backup task");
-                            let mut entry = tasks.remove(&ttid).unwrap();
-                            shut_down_task(ttid, &mut entry).await;
-                        }
-                    }
-                }.instrument(info_span!("WAL backup", ttid = %ttid)).await;
-            }
-            // For each timeline needing offloading, check if this safekeeper
-            // should do the job and start/stop the task accordingly.
-            _ = ticker.tick() => {
-                for (ttid, entry) in tasks.iter_mut() {
-                    update_task(&conf, *ttid, entry)
-                        .instrument(info_span!("WAL backup", ttid = %ttid))
-                        .await;
-                }
-            }
-        }
-    }
-}
-
 struct WalBackupTask {
     timeline: Arc<Timeline>,
     timeline_dir: Utf8PathBuf,
@@ -261,6 +207,7 @@ struct WalBackupTask {
 }
 
 /// Offload single timeline.
+#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
 async fn backup_task_main(
     ttid: TenantTimelineId,
     timeline_dir: Utf8PathBuf,
@@ -268,6 +215,8 @@ async fn backup_task_main(
     parallel_jobs: usize,
     mut shutdown_rx: Receiver<()>,
 ) {
+    let _guard = WAL_BACKUP_TASKS.guard();
+
     info!("started");
     let res = GlobalTimelines::get(ttid);
     if let Err(e) = res {

From 664f92dc6e6eac25380272fe4e466b4ddc4954bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 22 May 2024 12:43:03 +0200
Subject: [PATCH 107/130] Refactor PageServerHandler::process_query parsing
 (#7835)

In the process_query function in page_service.rs there was some
redundant duplication. Remove it and create a vector of whitespace
separated parts at the start and then use `slice::strip_prefix`. Only
use `starts_with` in the places with multiple whitespace separated
parameters: here we want to preserve grep/rg ability.

Followup of #7815, requested in
https://github.com/neondatabase/neon/pull/7815#pullrequestreview-2068835674
---
 pageserver/src/page_service.rs | 70 ++++++++++++----------------------
 1 file changed, 25 insertions(+), 45 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index c066f56c17..d250864fd6 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1521,9 +1521,8 @@ where
 
         let ctx = self.connection_ctx.attached_child();
         debug!("process query {query_string:?}");
-        if query_string.starts_with("pagestream_v2 ") {
-            let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
+        let parts = query_string.split_whitespace().collect::<Vec<_>>();
+        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
             if params.len() != 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for pagestream command"
@@ -1548,9 +1547,7 @@ where
                 ctx,
             )
             .await?;
-        } else if query_string.starts_with("pagestream ") {
-            let (_, params_raw) = query_string.split_at("pagestream ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
+        } else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
             if params.len() != 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for pagestream command"
@@ -1575,10 +1572,7 @@ where
                 ctx,
             )
             .await?;
-        } else if query_string.starts_with("basebackup ") {
-            let (_, params_raw) = query_string.split_at("basebackup ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
-
+        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
             if params.len() < 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for basebackup command"
@@ -1596,26 +1590,23 @@ where
 
             self.check_permission(Some(tenant_id))?;
 
-            let lsn = if params.len() >= 3 {
+            let lsn = if let Some(lsn_str) = params.get(2) {
                 Some(
-                    Lsn::from_str(params[2])
-                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                    Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
                 )
             } else {
                 None
             };
 
-            let gzip = if params.len() >= 4 {
-                if params[3] == "--gzip" {
-                    true
-                } else {
+            let gzip = match params.get(3) {
+                Some(&"--gzip") => true,
+                None => false,
+                Some(third_param) => {
                     return Err(QueryError::Other(anyhow::anyhow!(
-                        "Parameter in position 3 unknown {}",
-                        params[3],
-                    )));
+                        "Parameter in position 3 unknown {third_param}",
+                    )))
                 }
-            } else {
-                false
             };
 
             let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1639,10 +1630,7 @@ where
             res?;
         }
         // return pair of prev_lsn and last_lsn
-        else if query_string.starts_with("get_last_record_rlsn ") {
-            let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
-
+        else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
             if params.len() != 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for get_last_record_rlsn command"
@@ -1684,10 +1672,7 @@ where
             .await?;
         }
         // same as basebackup, but result includes relational data as well
-        else if query_string.starts_with("fullbackup ") {
-            let (_, params_raw) = query_string.split_at("fullbackup ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
-
+        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
             if params.len() < 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for fullbackup command"
@@ -1704,18 +1689,18 @@ where
                 .record("timeline_id", field::display(timeline_id));
 
             // The caller is responsible for providing correct lsn and prev_lsn.
-            let lsn = if params.len() > 2 {
+            let lsn = if let Some(lsn_str) = params.get(2) {
                 Some(
-                    Lsn::from_str(params[2])
-                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                    Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
                 )
             } else {
                 None
             };
-            let prev_lsn = if params.len() > 3 {
+            let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
                 Some(
-                    Lsn::from_str(params[3])
-                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
+                    Lsn::from_str(prev_lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
                 )
             } else {
                 None
@@ -1748,8 +1733,7 @@ where
             // 2. Run:
             // cat my_backup/base.tar | psql -h $PAGESERVER \
             //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
-            let (_, params_raw) = query_string.split_at("import basebackup ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+            let params = &parts[2..];
             if params.len() != 5 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for import basebackup command"
@@ -1798,8 +1782,7 @@ where
             //
             // Files are scheduled to be persisted to remote storage, and the
             // caller should poll the http api to check when that is done.
-            let (_, params_raw) = query_string.split_at("import wal ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+            let params = &parts[2..];
             if params.len() != 4 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for import wal command"
@@ -1838,8 +1821,7 @@ where
             // on connect
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else if query_string.starts_with("lease lsn ") {
-            let (_, params_raw) = query_string.split_at("lease lsn ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+            let params = &parts[2..];
             if params.len() != 3 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number {} for lease lsn command",
@@ -1875,10 +1857,8 @@ where
                     ))?
                 }
             };
-        } else if query_string.starts_with("show ") {
+        } else if let Some(params) = parts.strip_prefix(&["show"]) {
             // show <tenant_id>
-            let (_, params_raw) = query_string.split_at("show ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
             if params.len() != 1 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for config command"

From b43f6daa488de05691775a8908920d274fb53369 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 2 May 2024 15:53:30 +0300
Subject: [PATCH 108/130] One more iteration on making walcraft test more
 robust.

Some WAL might be inserted on the page boundary before XLOG_SWITCH lands there,
repeat construction in this case.
---
 libs/postgres_ffi/wal_craft/src/lib.rs | 46 ++++++++++++--------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index b6769629a8..6052f04d11 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -373,31 +373,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
                 "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
                 &[&(repeats as i32)],
             )?;
-            break;
-        }
-        info!(
-            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
-            client.pg_current_wal_insert_lsn()?,
-            XLOG_SIZE_OF_XLOG_RECORD
-        );
+            info!(
+                "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+                client.pg_current_wal_insert_lsn()?,
+                XLOG_SIZE_OF_XLOG_RECORD
+            );
 
-        // Emit the XLOG_SWITCH
-        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-        let next_segment = PgLsn::from(0x0200_0000);
-        ensure!(
-            xlog_switch_record_end < next_segment,
-            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
-            xlog_switch_record_end,
-            next_segment
-        );
-        ensure!(
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
-            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            xlog_switch_record_end,
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
-        );
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+            // Emit the XLOG_SWITCH
+            let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+            let xlog_switch_record_end: PgLsn =
+                client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+
+            if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+                != XLOG_SIZE_OF_XLOG_SHORT_PHD
+            {
+                warn!(
+                    "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
+                    xlog_switch_record_end,
+                    u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+                );
+                continue;
+            }
+            return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
+        }
     }
 }
 

From ef96c82c9f189633629e6bfc051a78fbab12c9a6 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 21 May 2024 19:33:03 +0300
Subject: [PATCH 109/130] Fix zenith_test_evict mode and clear_buffer_cache()
 function

Using InvalidateBuffer is wrong, because if the page is concurrently
dirtied, it will throw away the dirty page without calling
smgwrite(). In Neon, that means that the last-written LSN update for
the page is missed.

In v16, use the new InvalidateVictimBuffer() function that does what
we need. In v15 and v14, backport the InvalidateVictimBuffer()
function.

Fixes issue https://github.com/neondatabase/neon/issues/7802
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 21ec61d539..0d30e28f74 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 21ec61d539d22a81fe811c2d79e26436820bc3f4
+Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index e2dbd63345..74fb144890 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit e2dbd63345c584de75173c27951f111249ae0016
+Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index c271017c6c..3c2b9d576c 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit c271017c6c4846be59948766baec2ba4ace5dc9c
+Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2
diff --git a/vendor/revisions.json b/vendor/revisions.json
index a3af9331fe..2f16f334c5 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"],
-  "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"],
-  "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"]
+  "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"],
+  "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"],
+  "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"]
 }

From df9ab1b5e3962da4d98c7b1cd6a7fa20f4ff3902 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 22 May 2024 15:43:21 +0300
Subject: [PATCH 110/130] refactor(test): duplication with fullbackup, tar
 content hashing (#7828)

"taking a fullbackup" is an ugly multi-liner copypasted in multiple
places, most recently with timeline ancestor detach tests. move it under
`PgBin` which is not a great place, but better than yet another utility
function.

Additionally:
- cleanup `psql_env` repetition (PgBin already configures that)
- move the backup tar comparison as a yet another free utility function
- use backup tar comparison in `test_import.py` where a size check was
done previously
- cleanup extra timeline creation from test

Cc: #7715
---
 test_runner/fixtures/neon_fixtures.py         |  22 ++++
 test_runner/fixtures/utils.py                 |  50 ++++++++
 test_runner/regress/test_fullbackup.py        |  23 ++--
 test_runner/regress/test_import.py            |  25 +---
 test_runner/regress/test_next_xid.py          |  11 +-
 .../regress/test_timeline_detach_ancestor.py  | 120 +++---------------
 6 files changed, 106 insertions(+), 145 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 7a660b64ed..b02054a702 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2788,6 +2788,28 @@ class PgBin:
         log.info(f"last checkpoint at {checkpoint_lsn}")
         return Lsn(checkpoint_lsn)
 
+    def take_fullbackup(
+        self,
+        pageserver: NeonPageserver,
+        tenant: TenantId,
+        timeline: TimelineId,
+        lsn: Lsn,
+        output: Path,
+    ):
+        """
+        Request fullbackup from pageserver, store it at 'output'.
+        """
+        cmd = [
+            "psql",
+            "--no-psqlrc",
+            pageserver.connstr(),
+            "-c",
+            f"fullbackup {tenant} {timeline} {lsn}",
+            "-o",
+            str(output),
+        ]
+        self.run_capture(cmd)
+
 
 @pytest.fixture(scope="function")
 def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 16dc9e8cfb..70263245e7 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -4,10 +4,13 @@ import json
 import os
 import re
 import subprocess
+import tarfile
 import threading
 import time
+from hashlib import sha256
 from pathlib import Path
 from typing import (
+    IO,
     TYPE_CHECKING,
     Any,
     Callable,
@@ -15,8 +18,10 @@ from typing import (
     Iterable,
     List,
     Optional,
+    Set,
     Tuple,
     TypeVar,
+    Union,
 )
 from urllib.parse import urlencode
 
@@ -499,3 +504,48 @@ class AuxFileStore(str, enum.Enum):
 
     def __str__(self) -> str:
         return f"'aux-{self.value}'"
+
+
+def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]):
+    """
+    This is essentially:
+
+    lines=$(comm -3 \
+        <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        | wc -l)
+    [ "$lines" = "0" ]
+
+    But in a more mac friendly fashion.
+    """
+    started_at = time.time()
+
+    def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
+        assert reader is not None
+        digest = sha256(usedforsecurity=False)
+        while True:
+            buf = reader.read(64 * 1024)
+            if not buf:
+                break
+            digest.update(buf)
+        return digest.digest()
+
+    def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
+        with tarfile.open(p) as f:
+            matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
+            ret = list(
+                map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
+            )
+            ret.sort(key=lambda t: t[0])
+            return ret
+
+    left_list, right_list = map(build_hash_list, [left, right])
+
+    try:
+        assert len(left_list) == len(right_list)
+
+        for left_tuple, right_tuple in zip(left_list, right_list):
+            assert left_tuple == right_tuple
+    finally:
+        elapsed = time.time() - started_at
+        log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index e1e4f700d4..e6d51a77a6 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -1,7 +1,7 @@
 import os
 from pathlib import Path
 
-from fixtures.common_types import Lsn, TimelineId
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -19,17 +19,16 @@ def test_fullbackup(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
     port_distributor: PortDistributor,
-    pg_distrib_dir: Path,
     test_output_dir: Path,
 ):
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_fullbackup")
-    endpoint_main = env.endpoints.create_start("test_fullbackup")
+    # endpoint needs to be alive until the fullbackup so that we have
+    # prev_record_lsn for the vanilla_pg to start in read-write mode
+    # for some reason this does not happen if endpoint is shutdown.
+    endpoint_main = env.endpoints.create_start("main")
 
     with endpoint_main.cursor() as cur:
-        timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
         # data loading may take a while, so increase statement timeout
         cur.execute("SET statement_timeout='300s'")
         cur.execute(
@@ -41,17 +40,13 @@ def test_fullbackup(
         lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
         log.info(f"start_backup_lsn = {lsn}")
 
-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Get and unpack fullbackup from pageserver
     restored_dir_path = env.repo_dir / "restored_datadir"
     os.mkdir(restored_dir_path, 0o750)
-    query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
     tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file
+    )
     subprocess_capture(
         env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
     )
@@ -61,7 +56,7 @@ def test_fullbackup(
     # use resetwal to overwrite it
     pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
     cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.run_capture(cmd)
 
     # Restore from the backup and find the data we inserted
     port = port_distributor.get_port()
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 1f1c8cc582..62229ebfe7 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -21,7 +21,7 @@ from fixtures.pageserver.utils import (
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.utils import subprocess_capture
+from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture
 
 
 def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
@@ -248,15 +248,9 @@ def _import(
     path to the backup archive file"""
     log.info(f"start_backup_lsn = {lsn}")
 
-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Get a fullbackup from pageserver
-    query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
     tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file)
 
     # Stop the first pageserver instance, erase all its data
     env.endpoints.stop_all()
@@ -305,22 +299,11 @@ def _import(
     assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
 
     # Take another fullbackup
-    query = f"fullbackup { tenant} {timeline} {lsn}"
     new_tar_output_file = test_output_dir / "fullbackup-new.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        query,
-        "-o",
-        str(new_tar_output_file),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file)
 
     # Check it's the same as the first fullbackup
-    # TODO pageserver should be checking checksum
-    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
+    assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set())
 
     # Check that gc works
     pageserver_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index 45c0e3e409..98fb06a0d6 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -5,7 +5,7 @@ from pathlib import Path
 
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
 )
@@ -71,22 +71,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
 def test_import_at_2bil(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    pg_distrib_dir: Path,
-    pg_bin,
+    pg_bin: PgBin,
     vanilla_pg,
 ):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
     ps_http = env.pageserver.http_client()
 
-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
     pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
     cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.run_capture(cmd)
 
     vanilla_pg.start()
     vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 3e435caeee..1563c161e0 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1,25 +1,18 @@
 import datetime
 import enum
-import tarfile
-import time
 from concurrent.futures import ThreadPoolExecutor
-from hashlib import sha256
-from pathlib import Path
 from queue import Empty, Queue
 from threading import Barrier
-from typing import IO, List, Set, Tuple, Union
+from typing import List, Tuple
 
 import pytest
 from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-    PgBin,
-    wait_for_last_flush_lsn,
-)
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
 from fixtures.pageserver.http import HistoricLayerInfo
 from fixtures.pageserver.utils import wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
+from fixtures.utils import assert_pageserver_backups_equal
 
 
 def by_end_lsn(info: HistoricLayerInfo) -> Lsn:
@@ -68,7 +61,6 @@ SHUTDOWN_ALLOWED_ERRORS = [
 @pytest.mark.parametrize("write_to_branch_first", [True, False])
 def test_ancestor_detach_branched_from(
     test_output_dir,
-    pg_distrib_dir,
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
     branchpoint: Branchpoint,
@@ -80,7 +72,6 @@ def test_ancestor_detach_branched_from(
     """
     env = neon_env_builder.init_start()
 
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
 
     client = env.pageserver.http_client()
@@ -160,16 +151,9 @@ def test_ancestor_detach_branched_from(
     # run fullbackup to make sure there are no off by one errors
     # take this on the parent
     fullbackup_before = test_output_dir / "fullbackup-before.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}",
-        "-o",
-        str(fullbackup_before),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before
+    )
 
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
     assert all_reparented == set()
@@ -200,16 +184,9 @@ def test_ancestor_detach_branched_from(
 
     # take this on the detached, at same lsn
     fullbackup_after = test_output_dir / "fullbackup-after.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}",
-        "-o",
-        str(fullbackup_after),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after
+    )
 
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
@@ -218,52 +195,7 @@ def test_ancestor_detach_branched_from(
     # as there is always "PREV_LSN: invalid" for "before"
     skip_files = {"zenith.signal"}
 
-    tar_cmp(fullbackup_before, fullbackup_after, skip_files)
-
-
-def tar_cmp(left: Path, right: Path, skip_files: Set[str]):
-    """
-    This is essentially:
-
-    lines=$(comm -3 \
-        <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
-        <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
-        | wc -l)
-    [ "$lines" = "0" ]
-
-    But in a more mac friendly fashion.
-    """
-    started_at = time.time()
-
-    def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
-        assert reader is not None
-        digest = sha256(usedforsecurity=False)
-        while True:
-            buf = reader.read(64 * 1024)
-            if not buf:
-                break
-            digest.update(buf)
-        return digest.digest()
-
-    def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
-        with tarfile.open(p) as f:
-            matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
-            ret = list(
-                map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
-            )
-            ret.sort(key=lambda t: t[0])
-            return ret
-
-    left_list, right_list = map(build_hash_list, [left, right])
-
-    try:
-        assert len(left_list) == len(right_list)
-
-        for left_tuple, right_tuple in zip(left_list, right_list):
-            assert left_tuple == right_tuple
-    finally:
-        elapsed = time.time() - started_at
-        log.info(f"tar_cmp completed in {elapsed}s")
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files)
 
 
 def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
@@ -483,7 +415,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn
 
 
 def test_compaction_induced_by_detaches_in_history(
-    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin
+    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin
 ):
     """
     Assuming the tree of timelines:
@@ -500,8 +432,6 @@ def test_compaction_induced_by_detaches_in_history(
     timeline broken.
     """
 
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     env = neon_env_builder.init_start(
         initial_tenant_conf={
             # we want to create layers manually so we don't branch on arbitrary
@@ -589,16 +519,9 @@ def test_compaction_induced_by_detaches_in_history(
 
     # take the fullbackup before and after inheriting the new L0s
     fullbackup_before = test_output_dir / "fullbackup-before.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}",
-        "-o",
-        str(fullbackup_before),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before
+    )
 
     for _, timeline_id in skip_main:
         reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
@@ -624,19 +547,12 @@ def test_compaction_induced_by_detaches_in_history(
     assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted"
 
     fullbackup_after = test_output_dir / "fullbackup_after.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}",
-        "-o",
-        str(fullbackup_after),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
+    )
 
     # we don't need to skip any files, because zenith.signal will be identical
-    tar_cmp(fullbackup_before, fullbackup_after, set())
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
 
 
 # TODO:

From d1d55bbd9fbdca34d7f96c391f4a0368fb8c6583 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 22 May 2024 14:43:10 +0100
Subject: [PATCH 111/130] CI(report-benchmarks-failures): fix condition (#7820)

## Problem

`report-benchmarks-failures` got skipped if a dependent job fails.

## Summary of changes
- Fix the if-condition by adding `&& failures()` to it; it'll make the
job run if the dependent job fails.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 14d19f7ae3..d8ad6e26d0 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -548,7 +548,7 @@ jobs:
 
   report-benchmarks-failures:
     needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
+    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
     runs-on: ubuntu-latest
 
     steps:

From ce44dfe3532b22689464cbeddbf392d05049b9c4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 22 May 2024 16:55:34 +0300
Subject: [PATCH 112/130] openapi: document timeline ancestor detach (#7650)

The openapi description with the error descriptions:

- 200 is used for "detached or has been detached previously"
- 400 is used for "cannot be detached right now" -- it's an odd thing,
but good enough
- 404 is used for tenant or timeline not found
- 409 is used for "can never be detached" (root timeline)
- 500 is used for transient errors (basically ill-defined shutdown
errors)
- 503 is used for busy (other tenant ancestor detach underway,
pageserver shutdown)

Cc: #6994
---
 pageserver/src/http/openapi_spec.yml          | 87 +++++++++++++++++++
 pageserver/src/http/routes.rs                 | 27 +++---
 .../src/tenant/timeline/detach_ancestor.rs    | 30 ++++++-
 .../regress/test_timeline_detach_ancestor.py  | 36 +++++++-
 4 files changed, 161 insertions(+), 19 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 107bcd4a22..e5eafc51f4 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -612,6 +612,80 @@ paths:
               schema:
                 $ref: "#/components/schemas/Error"
 
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        ŕequired: true
+        schema:
+          type: string
+
+    put:
+      description: |
+        Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
+        Current implementation might not be retryable across failure cases, but will be enhanced in future.
+        Detaching should be expected to be expensive operation. Timeouts should be retried.
+      responses:
+        "200":
+          description: |
+            The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
+            If any timelines were deleted after reparenting, they might not be on this list.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AncestorDetached"
+
+        "400":
+          description: |
+            Number of early checks meaning the timeline cannot be detached now:
+              - the ancestor of timeline has an ancestor: not supported, see RFC
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "404":
+          description: Tenant or timeline not found.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+
+        "409":
+          description: |
+            The timeline can never be detached:
+              - timeline has no ancestor, implying that the timeline has never had an ancestor
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+
+        "500":
+          description: |
+            Transient error, for example, pageserver shutdown happened while
+            processing the request but we were unable to distinguish that. Must
+            be retried.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "503":
+          description: |
+            Temporarily unavailable, please retry. Possible reasons:
+              - another timeline detach for the same tenant is underway, please retry later
+              - detected shutdown error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+
   /v1/tenant/:
     get:
       description: Get tenants list
@@ -1077,6 +1151,19 @@ components:
           format: int64
           description: How many bytes of layer content were in the latest layer heatmap
 
+    AncestorDetached:
+      type: object
+      required:
+        - reparented_timelines
+      properties:
+        reparented_timelines:
+          type: array
+          description: Set of reparented timeline ids
+          properties:
+            type: string
+            format: hex
+            description: TimelineId
+
 
     Error:
       type: object
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index c75e4ca5a9..34b9806a26 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,6 +74,7 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
+use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -279,6 +280,13 @@ impl From<GetTenantError> for ApiError {
     }
 }
 
+impl From<GetTimelineError> for ApiError {
+    fn from(gte: GetTimelineError) -> Self {
+        // Rationale: tenant is activated only after eligble timelines activate
+        ApiError::NotFound(gte.into())
+    }
+}
+
 impl From<GetActiveTenantError> for ApiError {
     fn from(e: GetActiveTenantError) -> ApiError {
         match e {
@@ -643,9 +651,7 @@ async fn timeline_preserve_initdb_handler(
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
 
-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, false)?;
 
         timeline
             .preserve_initdb_archive()
@@ -687,9 +693,7 @@ async fn timeline_detail_handler(
 
         tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, false)?;
 
         let timeline_info = build_timeline_info(
             &timeline,
@@ -1901,14 +1905,11 @@ async fn timeline_detach_ancestor_handler(
         let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
         let ctx = &ctx;
 
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, true)?;
 
         let (_guard, prepared) = timeline
             .prepare_to_detach_from_ancestor(&tenant, options, ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .await?;
 
         let res = state
             .tenant_manager
@@ -2042,9 +2043,7 @@ async fn active_timeline_of_active_tenant(
 
     tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
-    tenant
-        .get_timeline(timeline_id, true)
-        .map_err(|e| ApiError::NotFound(e.into()))
+    Ok(tenant.get_timeline(timeline_id, true)?)
 }
 
 async fn always_panic_handler(
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 4d8e570181..e6ddabe5b5 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
+use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
 
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum Error {
@@ -41,6 +41,27 @@ pub(crate) enum Error {
     Unexpected(#[source] anyhow::Error),
 }
 
+impl From<Error> for ApiError {
+    fn from(value: Error) -> Self {
+        match value {
+            e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
+            // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
+            e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
+            Error::ShuttingDown => ApiError::ShuttingDown,
+            Error::OtherTimelineDetachOngoing(_) => {
+                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
+            }
+            // All of these contain shutdown errors, in fact, it's the most common
+            e @ Error::FlushAncestor(_)
+            | e @ Error::RewrittenDeltaDownloadFailed(_)
+            | e @ Error::CopyDeltaPrefix(_)
+            | e @ Error::UploadRewritten(_)
+            | e @ Error::CopyFailed(_)
+            | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
+        }
+    }
+}
+
 pub(crate) struct PreparedTimelineDetach {
     layers: Vec<Layer>,
 }
@@ -75,6 +96,11 @@ pub(super) async fn prepare(
         .as_ref()
         .map(|tl| (tl.clone(), detached.ancestor_lsn))
     else {
+        // TODO: check if we have already been detached; for this we need to read the stored data
+        // on remote client, for that we need a follow-up which makes uploads cheaper and maintains
+        // a projection of the commited data.
+        //
+        // the error is wrong per openapi
         return Err(NoAncestor);
     };
 
@@ -84,7 +110,7 @@ pub(super) async fn prepare(
 
     if ancestor.ancestor_timeline.is_some() {
         // non-technical requirement; we could flatten N ancestors just as easily but we chose
-        // not to
+        // not to, at least initially
         return Err(TooManyAncestors);
     }
 
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 1563c161e0..f0b2f7d733 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -8,9 +8,13 @@ from typing import List, Tuple
 import pytest
 from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.http import HistoricLayerInfo
-from fixtures.pageserver.utils import wait_timeline_detail_404
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
+from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
 from fixtures.utils import assert_pageserver_backups_equal
 
@@ -555,6 +559,32 @@ def test_compaction_induced_by_detaches_in_history(
     assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
 
 
+def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    client = env.pageserver.http_client()
+
+    with pytest.raises(PageserverApiException, match=".* no ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, env.initial_timeline)
+    assert info.value.status_code == 409
+
+    first_branch = env.neon_cli.create_branch("first_branch")
+    second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch")
+
+    # funnily enough this does not have a prefix
+    with pytest.raises(PageserverApiException, match="too many ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, second_branch)
+    assert info.value.status_code == 400
+
+    client.tenant_delete(env.initial_tenant)
+    wait_tenant_status_404(client, env.initial_tenant, 10, 1)
+
+    with pytest.raises(PageserverApiException) as e:
+        client.detach_ancestor(env.initial_tenant, first_branch)
+    assert e.value.status_code == 404
+
+
 # TODO:
 # - after starting the operation, tenant is deleted
 # - after starting the operation, pageserver is shutdown, restarted

From 8901ce9c99f068464e9ee15673f25569ff298f17 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 21 May 2024 18:26:47 -0500
Subject: [PATCH 113/130] Fix typos in action definitions

---
 .github/actions/neon-branch-create/action.yml  |  6 +++---
 .github/actions/neon-branch-delete/action.yml  |  8 ++++----
 .github/actions/neon-project-create/action.yml | 12 ++++++------
 .github/actions/neon-project-delete/action.yml |  6 +++---
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml
index dea3fc2357..9f752d5a89 100644
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project to create Branch in'
+    description: 'ID of the Project to create Branch in'
     required: true
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
 outputs:
   dsn:
diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml
index 8acba7ad00..58141a4a3f 100644
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project which should be deleted'
+    description: 'ID of the Project which should be deleted'
     required: true
   branch_id:
-    desctiption: 'ID of the branch to delete'
+    description: 'ID of the branch to delete'
     required: true
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
 
 runs:
diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index 7f0e599b97..4039a58b9e 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   region_id:
-    desctiption: 'Region ID, if not set the project will be created in the default region'
+    description: 'Region ID, if not set the project will be created in the default region'
     default: aws-us-east-2
   postgres_version:
-    desctiption: 'Postgres version; default is 15'
+    description: 'Postgres version; default is 15'
     default: 15
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
   provisioner:
-    desctiption: 'k8s-pod or k8s-neonvm'
+    description: 'k8s-pod or k8s-neonvm'
     default: 'k8s-pod'
   compute_units:
-    desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
     default: '[1, 1]'
 
 outputs:
diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml
index b8ec6cac70..35e165fd61 100644
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project to delete'
+    description: 'ID of the Project to delete'
     required: true
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
 
 runs:

From 900f39111507bc058bd541b66deddd66d6473443 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 21 May 2024 18:27:30 -0500
Subject: [PATCH 114/130] Make postgres_version action input default to a
 string

This is "required" by GitHub Actions, though they must do some coersion
on their side.
---
 .github/actions/neon-project-create/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index 4039a58b9e..16759ad038 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -10,7 +10,7 @@ inputs:
     default: aws-us-east-2
   postgres_version:
     description: 'Postgres version; default is 15'
-    default: 15
+    default: '15'
   api_host:
     description: 'Neon API host'
     default: console-stage.neon.build

From 325f3784f9fe46b5152000ba50ae4a01c9fd68a4 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 22 May 2024 16:02:20 +0100
Subject: [PATCH 115/130] CI(promote-images): simplify & fix the job (#7826)

## Problem

Currently, `latest` tag is added to the images in several cases:
```
github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
```

This leads to a race; the `latest` tag jumps back and forth depending on
the branch that has built images.

## Summary of changes
- Do not push `latest` images to prod ECR (we don't use it)
- Use `docker buildx imagetools` instead of `crane` for tagging images
- Unify `vm-compute-node-image` job with others and use dockerhub as a
first source for images (sync images with ECR)
- Tag images with `latest` only for commits in `main`
---
 .github/workflows/build_and_test.yml | 117 ++++++++++++---------------
 1 file changed, 52 insertions(+), 65 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d8ad6e26d0..5056025457 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -883,22 +883,39 @@ jobs:
           curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
           chmod +x vm-builder
 
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
       # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
       # it won't have the proper authentication (written at v0.6.0)
       - name: Pulling compute-node image
         run: |
-          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
       - name: Build vm image
         run: |
           ./vm-builder \
             -spec=vm-image-spec.yaml \
-            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
       - name: Pushing vm-compute-node image
         run: |
-          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
 
   test-images:
     needs: [ check-permissions, tag, neon-image, compute-node-image ]
@@ -946,78 +963,48 @@ jobs:
 
   promote-images:
     needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-    # Don't add if-condition here.
-    # The job should always be run because we have dependant other jobs that shouldn't be skipped
+    runs-on: ubuntu-latest
+
+    env:
+      VERSIONS: v14 v15 v16
 
     steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
-      - name: Copy vm-compute-node images to Docker Hub
+      - name: Copy vm-compute-node images to ECR
         run: |
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
+          for version in ${VERSIONS}; do
+            docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
+                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+          done
 
       - name: Add latest tag to images
-        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+        if: github.ref_name == 'main'
         run: |
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
+            docker buildx imagetools create -t $repo/neon:latest \
+                                               $repo/neon:${{ needs.tag.outputs.build-tag }}
 
-      - name: Push images to production ECR
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
+            docker buildx imagetools create -t $repo/compute-tools:latest \
+                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
 
-      - name: Configure Docker Hub login
-        run: |
-          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
-          echo "" > /github/home/.docker/config.json
-          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
+            for version in ${VERSIONS}; do
+              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
+                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
 
-      - name: Push vm-compute-node to Docker Hub
-        run: |
-          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
-
-      - name: Push latest tags to Docker Hub
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+              docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
+                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+            done
+          done
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]

From a7f31f1a59aec1cdd66df71ffaf3b7ad176c4966 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 22 May 2024 16:06:05 +0100
Subject: [PATCH 116/130] CI: build multi-arch images (#7696)

## Problem

We don't build our docker images for ARM arch, and that makes it harder
to run images on ARM (on MacBooks with Apple Silicon, for example).

## Summary of changes
- Build `neondatabase/neon` for ARM and create a multi-arch image
- Build `neondatabase/compute-node-vXX` for ARM and create a multi-arch
image
- Run `test-images` job on ARM as well
---
 .github/workflows/build_and_test.yml      | 123 +++++++++++++++++-----
 docker-compose/compute_wrapper/Dockerfile |   2 +-
 docker-compose/docker_compose_test.sh     |   2 -
 3 files changed, 97 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 5056025457..2ab1417d6d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -723,9 +723,13 @@ jobs:
     uses: ./.github/workflows/trigger-e2e-tests.yml
     secrets: inherit
 
-  neon-image:
+  neon-image-arch:
     needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
       - name: Checkout
@@ -747,12 +751,6 @@ jobs:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
       - uses: docker/build-push-action@v5
         with:
           context: .
@@ -764,25 +762,52 @@ jobs:
           push: true
           pull: true
           file: Dockerfile
-          cache-from: type=registry,ref=neondatabase/neon:cache
-          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
           tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
-            neondatabase/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
       - name: Remove custom docker config directory
         if: always()
         run: |
           rm -rf .docker-custom
 
-  compute-node-image:
-    needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+  neon-image:
+    needs: [ neon-image-arch, tag ]
+    runs-on: ubuntu-latest
 
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Push multi-arch image to ECR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/neon:${{ needs.tag.outputs.build-tag }}
+
+  compute-node-image-arch:
+    needs: [ check-permissions, build-build-tools-image, tag ]
     strategy:
       fail-fast: false
       matrix:
         version: [ v14, v15, v16 ]
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
       - name: Checkout
@@ -829,15 +854,14 @@ jobs:
           push: true
           pull: true
           file: Dockerfile.compute-node
-          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
           tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
       - name: Build compute-tools image
         # compute-tools are Postgres independent, so build it only once
-        if: ${{ matrix.version == 'v16' }}
+        if: matrix.version == 'v16'
         uses: docker/build-push-action@v5
         with:
           target: compute-tools-image
@@ -851,14 +875,57 @@ jobs:
           pull: true
           file: Dockerfile.compute-node
           tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
       - name: Remove custom docker config directory
         if: always()
         run: |
           rm -rf .docker-custom
 
+  compute-node-image:
+    needs: [ compute-node-image-arch, tag ]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        version: [ v14, v15, v16 ]
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch compute-node image
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - name: Create multi-arch compute-tools image
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+
+      - name: Push multi-arch compute-tools image to ECR
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
   vm-compute-node-image:
     needs: [ check-permissions, tag, compute-node-image ]
     runs-on: [ self-hosted, gen3, large ]
@@ -866,9 +933,6 @@ jobs:
       fail-fast: false
       matrix:
         version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
     env:
       VM_BUILDER_VERSION: v0.28.1
 
@@ -919,7 +983,12 @@ jobs:
 
   test-images:
     needs: [ check-permissions, tag, neon-image, compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
     steps:
       - name: Checkout
@@ -937,7 +1006,7 @@ jobs:
       - name: Verify image versions
         shell: bash # ensure no set -e for better error messages
         run: |
-          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+          pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
 
           echo "Pageserver version string: $pageserver_version"
 
diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile
index f1b1986072..974dcd7f03 100644
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -1,4 +1,4 @@
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG REPOSITORY=neondatabase
 ARG COMPUTE_IMAGE=compute-node-v14
 ARG TAG=latest
 
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index e18b0f9176..062fc6fc92 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -8,8 +8,6 @@
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
 
-# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
-
 set -eux -o pipefail
 
 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

From e015b2bf3eb96c7a5d8b9bf0253ae13e569f5747 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 22 May 2024 16:10:58 +0100
Subject: [PATCH 117/130] safekeeper: use CancellationToken instead of watch
 channel (#7836)

## Problem

Safekeeper Timeline uses a channel for cancellation, but we have a
dedicated type for that.

## Summary of changes

- Use CancellationToken in Timeline
---
 safekeeper/src/recovery.rs           | 10 ++-------
 safekeeper/src/timeline.rs           | 31 +++++++---------------------
 safekeeper/src/timeline_manager.rs   | 10 +--------
 safekeeper/src/wal_backup_partial.rs | 14 +++----------
 4 files changed, 13 insertions(+), 52 deletions(-)

diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs
index e8fa6c55f4..dfa1892c40 100644
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -37,17 +37,11 @@ use crate::{
 #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
 pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     info!("started");
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
 
+    let cancel = tli.cancel.clone();
     select! {
         _ = recovery_main_loop(tli, conf) => { unreachable!() }
-        _ = cancellation_rx.changed() => {
+        _ = cancel.cancelled() => {
             info!("stopped");
         }
     }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index da2e3f4538..89c157d514 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -6,6 +6,7 @@ use camino::Utf8PathBuf;
 use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
 use tokio::fs;
+use tokio_util::sync::CancellationToken;
 
 use std::cmp::max;
 use std::ops::{Deref, DerefMut};
@@ -342,12 +343,8 @@ pub struct Timeline {
     walsenders: Arc<WalSenders>,
     walreceivers: Arc<WalReceivers>,
 
-    /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
-    cancellation_tx: watch::Sender<bool>,
-
-    /// Timeline should not be used after cancellation. Background tasks should
-    /// monitor this channel and stop eventually after receiving `true` from this channel.
-    cancellation_rx: watch::Receiver<bool>,
+    /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
+    pub(crate) cancel: CancellationToken,
 
     /// Directory where timeline state is stored.
     pub timeline_dir: Utf8PathBuf,
@@ -376,7 +373,6 @@ impl Timeline {
             shared_state.sk.flush_lsn(),
         )));
         let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
-        let (cancellation_tx, cancellation_rx) = watch::channel(false);
 
         let walreceivers = WalReceivers::new();
         Ok(Timeline {
@@ -390,8 +386,7 @@ impl Timeline {
             mutex: RwLock::new(shared_state),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
-            cancellation_rx,
-            cancellation_tx,
+            cancel: CancellationToken::default(),
             timeline_dir: conf.timeline_dir(&ttid),
             walsenders_keep_horizon: conf.walsenders_keep_horizon,
             broker_active: AtomicBool::new(false),
@@ -411,7 +406,6 @@ impl Timeline {
         let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
             watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
         let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
-        let (cancellation_tx, cancellation_rx) = watch::channel(false);
 
         let state =
             TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
@@ -428,8 +422,7 @@ impl Timeline {
             mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
-            cancellation_rx,
-            cancellation_tx,
+            cancel: CancellationToken::default(),
             timeline_dir: conf.timeline_dir(&ttid),
             walsenders_keep_horizon: conf.walsenders_keep_horizon,
             broker_active: AtomicBool::new(false),
@@ -535,7 +528,7 @@ impl Timeline {
     /// eventually after receiving cancellation signal.
     fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
         info!("timeline {} is cancelled", self.ttid);
-        let _ = self.cancellation_tx.send(true);
+        self.cancel.cancel();
         // Close associated FDs. Nobody will be able to touch timeline data once
         // it is cancelled, so WAL storage won't be opened again.
         shared_state.sk.wal_store.close();
@@ -543,17 +536,7 @@ impl Timeline {
 
     /// Returns if timeline is cancelled.
     pub fn is_cancelled(&self) -> bool {
-        *self.cancellation_rx.borrow()
-    }
-
-    /// Returns watch channel which gets value when timeline is cancelled. It is
-    /// guaranteed to have not cancelled value observed (errors otherwise).
-    pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
-        let rx = self.cancellation_rx.clone();
-        if *rx.borrow() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-        Ok(rx)
+        self.cancel.is_cancelled()
     }
 
     /// Take a writing mutual exclusive lock on timeline shared_state.
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index 52ad915065..e74ba37ad8 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -47,14 +47,6 @@ pub async fn main_task(
     conf: SafeKeeperConf,
     broker_active_set: Arc<TimelinesSet>,
 ) {
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
     scopeguard::defer! {
         if tli.is_cancelled() {
             info!("manager task finished");
@@ -129,7 +121,7 @@ pub async fn main_task(
         // wait until something changes. tx channels are stored under Arc, so they will not be
         // dropped until the manager task is finished.
         tokio::select! {
-            _ = cancellation_rx.changed() => {
+            _ = tli.cancel.cancelled() => {
                 // timeline was deleted
                 break 'outer state_snapshot;
             }
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 200096ac5c..29e944bff3 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -277,14 +277,6 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
 
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
     // sleep for random time to avoid thundering herd
     {
         let randf64 = rand::thread_rng().gen_range(0.0..1.0);
@@ -327,7 +319,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
                 && flush_lsn_rx.borrow().term == seg.term
             {
                 tokio::select! {
-                    _ = cancellation_rx.changed() => {
+                    _ = backup.tli.cancel.cancelled() => {
                         info!("timeline canceled");
                         return;
                     }
@@ -340,7 +332,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
         // if we don't have any data and zero LSNs, wait for something
         while flush_lsn_rx.borrow().lsn == Lsn(0) {
             tokio::select! {
-                _ = cancellation_rx.changed() => {
+                _ = backup.tli.cancel.cancelled() => {
                     info!("timeline canceled");
                     return;
                 }
@@ -357,7 +349,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
         // waiting until timeout expires OR segno changes
         'inner: loop {
             tokio::select! {
-                _ = cancellation_rx.changed() => {
+                _ = backup.tli.cancel.cancelled() => {
                     info!("timeline canceled");
                     return;
                 }

From 62aac6c8add432da1ab2d206f19323e808622e8d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 22 May 2024 18:13:45 +0300
Subject: [PATCH 118/130] fix(Layer): carry gate until eviction is complete
 (#7838)

the gate was accidentially being dropped before the final blocking
phase, possibly explaining the resident physical size global problems
during deletions.

it could had caused more harm as well, but the path is not actively
being tested because cplane no longer puts locationconfigs with higher
generation number during normal operation which prompted the last wave
of fixes.

Cc: #7341.
---
 pageserver/src/tenant/storage_layer/layer.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 45d61ce048..8c64621710 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
-use utils::sync::heavier_once_cell;
+use utils::sync::{gate, heavier_once_cell};
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -1333,7 +1333,7 @@ impl LayerInner {
 
         is_good_to_continue(&rx.borrow_and_update())?;
 
-        let Ok(_gate) = timeline.gate.enter() else {
+        let Ok(gate) = timeline.gate.enter() else {
             return Err(EvictionCancelled::TimelineGone);
         };
 
@@ -1421,7 +1421,7 @@ impl LayerInner {
         Self::spawn_blocking(move || {
             let _span = span.entered();
 
-            let res = self.evict_blocking(&timeline, &permit);
+            let res = self.evict_blocking(&timeline, &gate, &permit);
 
             let waiters = self.inner.initializer_count();
 
@@ -1447,6 +1447,7 @@ impl LayerInner {
     fn evict_blocking(
         &self,
         timeline: &Timeline,
+        _gate: &gate::GateGuard,
         _permit: &heavier_once_cell::InitPermit,
     ) -> Result<(), EvictionCancelled> {
         // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`

From 3404e76a51311a595e28beb7d0962488cafaf143 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 22 May 2024 14:36:13 +0300
Subject: [PATCH 119/130] Fix confusion between 1-based Buffer and 0-based
 index (#7825)

The code was working correctly, but was incorrectly using Buffer for a
0-based index into the BufferDesc array.
---
 pgxn/neon/pagestore_smgr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 249ad313b0..4361c74905 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -3216,7 +3216,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
-	Buffer		buffer;
+	int			buf_id;
 	bool		no_redo_needed;
 
 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3254,9 +3254,9 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	else
 	{
 		/* Try to find the relevant buffer */
-		buffer = BufTableLookup(&tag, hash);
+		buf_id = BufTableLookup(&tag, hash);
 
-		no_redo_needed = buffer < 0;
+		no_redo_needed = buf_id < 0;
 	}
 	/* In both cases st lwlsn past this WAL record */
 	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

From 921756402673e9e2fa764fbacf33457f05bc3d59 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 22 May 2024 14:57:09 +0300
Subject: [PATCH 120/130] Fix issues with determining request LSN in read
 replica (#7795)

Don't set last-written LSN of a page when the record is replayed, only
when the page is evicted from cache. For comparison, we don't update
the last-written LSN on every page modification on the primary either,
only when the page is evicted. Do update the last-written LSN when the
page update is skipped in WAL redo, however.

In neon_get_request_lsns(), don't be surprised if the last-written LSN
is equal to the record being replayed. Use the LSN of the record being
replayed as the request LSN in that case. Add a long comment
explaining how that can happen.

In neon_wallog_page, update last-written LSN also when Shutdown has
been requested. We might still fetch and evict pages for a while,
after shutdown has been requested, so we better continue to do that
correctly.

Enable the check that we don't evict a page with zero LSN also in
standby, but make it a LOG message instead of PANIC

Fixes issue https://github.com/neondatabase/neon/issues/7791
---
 pgxn/neon/pagestore_smgr.c              | 142 +++++++++++++++++++++---
 test_runner/regress/test_hot_standby.py |  60 ++++++++++
 2 files changed, 185 insertions(+), 17 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 4361c74905..41546eae85 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1349,6 +1349,10 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }
 
+/*
+ * A page is being evicted from the shared buffer cache. Update the
+ * last-written LSN of the page, and WAL-log it if needed.
+ */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1357,12 +1361,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
-
-	if (ShutdownRequestPending)
-		return;
-	/* Don't log any pages if we're not allowed to do so. */
-	if (!XLogInsertAllowed())
-		return;
+	bool		log_page;
 
 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1371,9 +1370,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	 * correctness, the non-logged updates are not critical. But we want to
 	 * have a reasonably up-to-date VM and FSM in the page server.
 	 */
-	if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
+	log_page = false;
+	if (force)
+	{
+		Assert(XLogInsertAllowed());
+		log_page = true;
+	}
+	else if (XLogInsertAllowed() &&
+			 !ShutdownRequestPending &&
+			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	{
+		log_page = true;
+	}
+
+	if (log_page)
 	{
-		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;
 
 		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1386,7 +1397,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
-	else if (lsn == InvalidXLogRecPtr)
+
+	if (lsn == InvalidXLogRecPtr)
 	{
 		/*
 		 * When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1422,19 +1434,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else
+		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
 		{
-			ereport(PANIC,
+			/*
+			 * Its a bad sign if there is a page with zero LSN in the buffer
+			 * cache in a standby, too. However, PANICing seems like a cure
+			 * worse than the disease, as the damage has likely already been
+			 * done in the primary. So in a standby, make this an assertion,
+			 * and in a release build just LOG the error and soldier on. We
+			 * update the last-written LSN of the page with a conservative
+			 * value in that case, which is the last replayed LSN.
+			 */
+			ereport(RecoveryInProgress() ? LOG : PANIC,
 					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
+			Assert(false);
+
+			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1527,8 +1551,92 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 
 	if (RecoveryInProgress())
 	{
-		/* Request the page at the last replayed LSN. */
-		result.request_lsn = GetXLogReplayRecPtr(NULL);
+		/*---
+		 * In broad strokes, a replica always requests the page at the current
+		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
+		 * it the last replayed record, or the record being replayed? And does
+		 * the startup process performing the replay need to do something
+		 * differently than backends running queries? Let's take a closer look
+		 * at the different scenarios:
+		 *
+		 * 1. Startup process reads a page, last_written_lsn is old.
+		 *
+		 * Read the old version of the page. We will apply the WAL record on
+		 * it to bring it up-to-date.
+		 *
+		 * We could read the new version, with the changes from this WAL
+		 * record already applied, to offload the work of replaying the record
+		 * to the pageserver. The pageserver might not have received the WAL
+		 * record yet, though, so a read of the old page version and applying
+		 * the record ourselves is likely faster. Also, the redo function
+		 * might be surprised if the changes have already applied. That's
+		 * normal during crash recovery, but not in hot standby.
+		 *
+		 * 2. Startup process reads a page, last_written_lsn == record we're
+		 *    replaying.
+		 *
+		 * Can this happen? There are a few theoretical cases when it might:
+		 *
+		 * A) The redo function reads the same page twice. We had already read
+		 *    and applied the changes once, and now we're reading it for the
+		 *    second time.  That would be a rather silly thing for a redo
+		 *    function to do, and I'm not aware of any that would do it.
+		 *
+		 * B) The redo function modifies multiple pages, and it already
+		 *    applied the changes to one of the pages, released the lock on
+		 *    it, and is now reading a second page.  Furthermore, the first
+		 *    page was already evicted from the buffer cache, and also from
+		 *    the last-written LSN cache, so that the per-relation or global
+		 *    last-written LSN was already updated. All the WAL redo functions
+		 *    hold the locks on pages that they modify, until all the changes
+		 *    have been modified (?), which would make that impossible.
+		 *    However, we skip the locking, if the page isn't currently in the
+		 *    page cache (see neon_redo_read_buffer_filter below).
+		 *
+		 * Even if the one of the above cases were possible in theory, they
+		 * would also require the pages being modified by the redo function to
+		 * be immediately evicted from the page cache.
+		 *
+		 * So this probably does not happen in practice. But if it does, we
+		 * request the new version, including the changes from the record
+		 * being replayed. That seems like the correct behavior in any case.
+		 *
+		 * 3. Backend process reads a page with old last-written LSN
+		 *
+		 * Nothing special here. Read the old version.
+		 *
+		 * 4. Backend process reads a page with last_written_lsn == record being replayed
+		 *
+		 * This can happen, if the redo function has started to run, and saw
+		 * that the page isn't present in the page cache (see
+		 * neon_redo_read_buffer_filter below).  Normally, in a normal
+		 * Postgres server, the redo function would hold a lock on the page,
+		 * so we would get blocked waiting the redo function to release the
+		 * lock. To emulate that, wait for the WAL replay of the record to
+		 * finish.
+		 */
+		/* Request the page at the end of the last fully replayed LSN. */
+		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
+
+		if (last_written_lsn > replay_lsn)
+		{
+			/* GetCurrentReplayRecPtr was introduced in v15 */
+#if PG_VERSION_NUM >= 150000
+			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
+#endif
+
+			/*
+			 * Cases 2 and 4. If this is a backend (case 4), the
+			 * neon_read_at_lsn() call later will wait for the WAL record to be
+			 * fully replayed.
+			 */
+			result.request_lsn = last_written_lsn;
+		}
+		else
+		{
+			/* cases 1 and 3 */
+			result.request_lsn = replay_lsn;
+		}
 		result.not_modified_since = last_written_lsn;
 		result.effective_request_lsn = result.request_lsn;
 		Assert(last_written_lsn <= result.request_lsn);
@@ -3258,16 +3366,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 
 		no_redo_needed = buf_id < 0;
 	}
-	/* In both cases st lwlsn past this WAL record */
-	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 
 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
 	 * evict page from file cache
 	 */
 	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 		lfc_evict(rinfo, forknum, blkno);
-
+	}
 
 	LWLockRelease(partitionLock);
 
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 244d482c18..cf7a1c56ee 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,3 +1,4 @@
+import asyncio
 import os
 import re
 import threading
@@ -292,3 +293,62 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             assert slot_xmin is None
 
         wait_until(10, 1.0, xmin_is_null)
+
+
+# Test race condition between WAL replay and backends performing queries
+# https://github.com/neondatabase/neon/issues/7791
+def test_replica_query_race(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    primary_ep = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    )
+
+    with primary_ep.connect() as p_con:
+        with p_con.cursor() as p_cur:
+            p_cur.execute("CREATE EXTENSION neon_test_utils")
+            p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
+
+    standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
+    time.sleep(1)
+
+    # In primary, run a lot of UPDATEs on a single page
+    finished = False
+    writecounter = 1
+
+    async def primary_workload():
+        nonlocal writecounter, finished
+        conn = await primary_ep.connect_async()
+        while writecounter < 10000:
+            writecounter += 1
+            await conn.execute(f"UPDATE test SET counter = {writecounter}")
+        finished = True
+
+    # In standby, at the same time, run queries on it. And repeatedly drop caches
+    async def standby_workload():
+        nonlocal writecounter, finished
+        conn = await standby_ep.connect_async()
+        reads = 0
+        while not finished:
+            readcounter = await conn.fetchval("SELECT counter FROM test")
+
+            # Check that the replica is keeping up with the primary. In local
+            # testing, the lag between primary and standby is much smaller, in
+            # the ballpark of 2-3 counter values. But be generous in case there's
+            # some hiccup.
+            # assert(writecounter - readcounter < 1000)
+            assert readcounter <= writecounter
+            if reads % 100 == 0:
+                log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
+            reads += 1
+
+            await conn.execute("SELECT clear_buffer_cache()")
+
+    async def both():
+        await asyncio.gather(
+            primary_workload(),
+            standby_workload(),
+        )
+
+    asyncio.run(both())

From 37f81289c2d1b27f88316980fb6b307e7e46d409 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 22 May 2024 18:24:52 +0300
Subject: [PATCH 121/130] Make 'neon.protocol_version = 2' the default, take
 two (#7819)

Once all the computes in production have restarted, we can remove
protocol version 1 altogether.

See issue #6211.

This was done earlier already in commit 0115fe6cb2, but reverted before
it was released to production in commit bbe730d7ca because of issue
https://github.com/neondatabase/neon/issues/7692. That issue was fixed
in commit 22afaea6e1, so we are ready to change the default again.
---
 pgxn/neon/libpagestore.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index b7b1e7ccbf..f5ce2caff3 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,7 +49,7 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;
 
-int         neon_protocol_version = 1;
+int         neon_protocol_version = 2;
 
 static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
 							"Version of compute<->page server protocol",
 							NULL,
 							&neon_protocol_version,
-							1, /* default to old protocol for now */
+							2, /* use protocol version 2 */
 							1, /* min */
 							2, /* max */
 							PGC_SU_BACKEND,

From 64577cfddcdac89d649e1ba6db3a6f44e14e2eee Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 22 May 2024 12:41:13 -0400
Subject: [PATCH 122/130] feat(pageserver): auto-detect previous aux file
 policy (#7841)

## Problem

If an existing user already has some aux v1 files, we don't want to
switch them to the global tenant-level config.

Part of #7462

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++
 pageserver/src/tenant.rs            | 67 ++++++++++++++++++++++++++++-
 2 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index f9d8c1020d..7dea687c46 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1480,6 +1480,29 @@ impl<'a> DatadirModification<'a> {
             // Allowed switch path:
             // * no aux files -> v1/v2/cross-validation
             // * cross-validation->v2
+
+            let current_policy = if current_policy.is_none() {
+                // This path will only be hit once per tenant: we will decide the final policy in this code block.
+                // The next call to `put_file` will always have `last_aux_file_policy != None`.
+                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
+                if aux_files_key_v1.is_empty() {
+                    None
+                } else {
+                    self.tline
+                        .last_aux_file_policy
+                        .store(Some(AuxFilePolicy::V1));
+                    self.tline
+                        .remote_client
+                        .schedule_index_upload_for_aux_file_policy_update(Some(
+                            AuxFilePolicy::V1,
+                        ))?;
+                    Some(AuxFilePolicy::V1)
+                }
+            } else {
+                current_policy
+            };
+
             if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
                 self.tline.last_aux_file_policy.store(Some(switch_policy));
                 self.tline
@@ -1775,6 +1798,12 @@ impl<'a> DatadirModification<'a> {
         self.tline.get(key, lsn, ctx).await
     }
 
+    /// Only used during unit tests, force putting a key into the modification.
+    #[cfg(test)]
+    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
+        self.put(key, val);
+    }
+
     fn put(&mut self, key: Key, val: Value) {
         let values = self.pending_updates.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1a66f2c919..caf26e0a0b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3964,18 +3964,20 @@ mod tests {
 
     use super::*;
     use crate::keyspace::KeySpaceAccum;
+    use crate::pgdatadir_mapping::AuxFilesDirectory;
     use crate::repository::{Key, Value};
     use crate::tenant::harness::*;
     use crate::tenant::timeline::CompactFlags;
     use crate::DEFAULT_PG_VERSION;
     use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
-    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
     use pageserver_api::models::CompactionAlgorithm;
     use rand::{thread_rng, Rng};
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
+    use utils::bin_ser::BeSer;
 
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -5997,6 +5999,69 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn aux_file_policy_auto_detect() {
+        let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+                files: vec![(
+                    "test_file".to_string(),
+                    Bytes::copy_from_slice(b"test_file"),
+                )]
+                .into_iter()
+                .collect(),
+            })
+            .unwrap();
+            modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "keep using v1 because there are aux files writting with v1"
+        );
+
+        // we can still read the auxfile v1
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("test_file"),
+            Some(&bytes::Bytes::from_static(b"test_file"))
+        );
+    }
+
     #[tokio::test]
     async fn test_metadata_image_creation() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_metadata_image_creation")?;

From 9cfe08e3d9f1181a163322705ff41cbcfb11db3b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 22 May 2024 18:05:43 +0100
Subject: [PATCH 123/130] proxy password threadpool (#7806)

## Problem

Despite making password hashing async, it can still take time away from
the network code.

## Summary of changes

Introduce a custom threadpool, inspired by rayon. Features:

### Fairness

Each task is tagged with it's endpoint ID. The more times we have seen
the endpoint, the more likely we are to skip the task if it comes up in
the queue. This is using a min-count-sketch estimator for the number of
times we have seen the endpoint, resetting it every 1000+ steps.

Since tasks are immediately rescheduled if they do not complete, the
worker could get stuck in a "always work available loop". To combat
this, we check the global queue every 61 steps to ensure all tasks
quickly get a worker assigned to them.

### Balanced

Using crossbeam_deque, like rayon does, we have workstealing out of the
box. I've tested it a fair amount and it seems to balance the workload
accordingly
---
 Cargo.lock                      |  20 +-
 Cargo.toml                      |   2 +
 proxy/Cargo.toml                |   4 +-
 proxy/src/auth/backend.rs       |  10 +-
 proxy/src/auth/backend/hacks.rs |  11 +-
 proxy/src/auth/flow.rs          |  24 ++-
 proxy/src/bin/proxy.rs          |   8 +
 proxy/src/config.rs             |   2 +
 proxy/src/metrics.rs            |  89 +++++++--
 proxy/src/scram.rs              |  18 +-
 proxy/src/scram/countmin.rs     | 173 +++++++++++++++++
 proxy/src/scram/exchange.rs     |  49 ++---
 proxy/src/scram/pbkdf2.rs       |  89 +++++++++
 proxy/src/scram/threadpool.rs   | 321 ++++++++++++++++++++++++++++++++
 proxy/src/serverless/backend.rs |  11 +-
 workspace_hack/Cargo.toml       |   2 +
 16 files changed, 759 insertions(+), 74 deletions(-)
 create mode 100644 proxy/src/scram/countmin.rs
 create mode 100644 proxy/src/scram/pbkdf2.rs
 create mode 100644 proxy/src/scram/threadpool.rs

diff --git a/Cargo.lock b/Cargo.lock
index e6060c82f5..d8f9021eb8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1471,26 +1471,21 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-deque"
-version = "0.8.3"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
 dependencies = [
- "cfg-if",
  "crossbeam-epoch",
  "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.14"
+version = "0.9.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
 dependencies = [
- "autocfg",
- "cfg-if",
  "crossbeam-utils",
- "memoffset 0.8.0",
- "scopeguard",
 ]
 
 [[package]]
@@ -3961,9 +3956,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
 
 [[package]]
 name = "pbkdf2"
-version = "0.12.1"
+version = "0.12.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
 dependencies = [
  "digest",
  "hmac",
@@ -4386,6 +4381,7 @@ dependencies = [
 name = "proxy"
 version = "0.1.0"
 dependencies = [
+ "ahash",
  "anyhow",
  "async-compression",
  "async-trait",
@@ -4402,6 +4398,7 @@ dependencies = [
  "chrono",
  "clap",
  "consumption_metrics",
+ "crossbeam-deque",
  "dashmap",
  "env_logger",
  "fallible-iterator",
@@ -7473,6 +7470,7 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
+ "ahash",
  "anyhow",
  "aws-config",
  "aws-runtime",
diff --git a/Cargo.toml b/Cargo.toml
index 2a7dea447e..0887c039f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ license = "Apache-2.0"
 
 ## All dependency versions, used in the project
 [workspace.dependencies]
+ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
+crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 5f9b0aa75b..7da0763bc1 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -9,6 +9,7 @@ default = []
 testing = []
 
 [dependencies]
+ahash.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
@@ -24,6 +25,7 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -52,7 +54,6 @@ opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
-pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
@@ -106,6 +107,7 @@ workspace_hack.workspace = true
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
 tokio-tungstenite.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 6a906b299b..3555eba543 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -365,7 +365,10 @@ async fn authenticate_with_secret(
     config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
     if let Some(password) = unauthenticated_password {
-        let auth_outcome = validate_password_and_exchange(&password, secret).await?;
+        let ep = EndpointIdInt::from(&info.endpoint);
+
+        let auth_outcome =
+            validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
         let keys = match auth_outcome {
             crate::sasl::Outcome::Success(key) => key,
             crate::sasl::Outcome::Failure(reason) => {
@@ -386,7 +389,7 @@ async fn authenticate_with_secret(
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
         ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
-        return hacks::authenticate_cleartext(ctx, info, client, secret).await;
+        return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
     }
 
     // Finally, proceed with the main auth flow (SCRAM-based).
@@ -554,7 +557,7 @@ mod tests {
         context::RequestMonitoring,
         proxy::NeonOptions,
         rate_limiter::{EndpointRateLimiter, RateBucketInfo},
-        scram::ServerSecret,
+        scram::{threadpool::ThreadPool, ServerSecret},
         stream::{PqStream, Stream},
     };
 
@@ -596,6 +599,7 @@ mod tests {
     }
 
     static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
+        thread_pool: ThreadPool::new(1),
         scram_protocol_timeout: std::time::Duration::from_secs(5),
         rate_limiter_enabled: true,
         rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index f7241be4a9..6b0f5e1726 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -3,8 +3,10 @@ use super::{
 };
 use crate::{
     auth::{self, AuthFlow},
+    config::AuthenticationConfig,
     console::AuthSecret,
     context::RequestMonitoring,
+    intern::EndpointIdInt,
     sasl,
     stream::{self, Stream},
 };
@@ -20,6 +22,7 @@ pub async fn authenticate_cleartext(
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     secret: AuthSecret,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
     warn!("cleartext auth flow override is enabled, proceeding");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -27,8 +30,14 @@ pub async fn authenticate_cleartext(
     // pause the timer while we communicate with the client
     let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
 
+    let ep = EndpointIdInt::from(&info.endpoint);
+
     let auth_flow = AuthFlow::new(client)
-        .begin(auth::CleartextPassword(secret))
+        .begin(auth::CleartextPassword {
+            secret,
+            endpoint: ep,
+            pool: config.thread_pool.clone(),
+        })
         .await?;
     drop(paused);
     // cleartext auth is only allowed to the ws/http protocol.
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 45bbad8cb2..59d1ac17f4 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -5,12 +5,14 @@ use crate::{
     config::TlsServerEndPoint,
     console::AuthSecret,
     context::RequestMonitoring,
-    sasl, scram,
+    intern::EndpointIdInt,
+    sasl,
+    scram::{self, threadpool::ThreadPool},
     stream::{PqStream, Stream},
 };
 use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
 use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
-use std::io;
+use std::{io, sync::Arc};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
@@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack {
 
 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword(pub AuthSecret);
+pub struct CleartextPassword {
+    pub pool: Arc<ThreadPool>,
+    pub endpoint: EndpointIdInt,
+    pub secret: AuthSecret,
+}
 
 impl AuthMethod for CleartextPassword {
     #[inline(always)]
@@ -126,7 +132,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
             .strip_suffix(&[0])
             .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
 
-        let outcome = validate_password_and_exchange(password, self.state.0).await?;
+        let outcome = validate_password_and_exchange(
+            &self.state.pool,
+            self.state.endpoint,
+            password,
+            self.state.secret,
+        )
+        .await?;
 
         if let sasl::Outcome::Success(_) = &outcome {
             self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -181,6 +193,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
 }
 
 pub(crate) async fn validate_password_and_exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
     password: &[u8],
     secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange(
         }
         // perform scram authentication as both client and server to validate the keys
         AuthSecret::Scram(scram_secret) => {
-            let outcome = crate::scram::exchange(&scram_secret, password).await?;
+            let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
 
             let client_key = match outcome {
                 sasl::Outcome::Success(client_key) => client_key,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index be7d961b8c..30f2e6f4b7 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
 use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use proxy::redis::elasticache;
 use proxy::redis::notifications;
+use proxy::scram::threadpool::ThreadPool;
 use proxy::serverless::cancel_set::CancelSet;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;
@@ -132,6 +133,9 @@ struct ProxyCliArgs {
     /// timeout for scram authentication protocol
     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     scram_protocol_timeout: tokio::time::Duration,
+    /// size of the threadpool for password hashing
+    #[clap(long, default_value_t = 4)]
+    scram_thread_pool_size: u8,
     /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     require_client_ip: bool,
@@ -489,6 +493,9 @@ async fn main() -> anyhow::Result<()> {
 
 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
+    let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
+    Metrics::install(thread_pool.metrics.clone());
+
     let tls_config = match (&args.tls_key, &args.tls_cert) {
         (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
             key_path,
@@ -624,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
     };
     let authentication_config = AuthenticationConfig {
+        thread_pool,
         scram_protocol_timeout: args.scram_protocol_timeout,
         rate_limiter_enabled: args.auth_rate_limit_enabled,
         rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index b7ab2c00f9..5a0c251ce2 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -2,6 +2,7 @@ use crate::{
     auth::{self, backend::AuthRateLimiter},
     console::locks::ApiLocks,
     rate_limiter::RateBucketInfo,
+    scram::threadpool::ThreadPool,
     serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
     Host,
 };
@@ -61,6 +62,7 @@ pub struct HttpConfig {
 }
 
 pub struct AuthenticationConfig {
+    pub thread_pool: Arc<ThreadPool>,
     pub scram_protocol_timeout: tokio::time::Duration,
     pub rate_limiter_enabled: bool,
     pub rate_limiter: AuthRateLimiter,
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 1590316925..e2a75a8720 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,11 +1,11 @@
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
 
 use lasso::ThreadedRodeo;
 use measured::{
-    label::StaticLabelSet,
+    label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
     metric::{histogram::Thresholds, name::MetricName},
-    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
-    MetricGroup,
+    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
+    LabelGroup, MetricGroup,
 };
 use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
 
@@ -14,26 +14,36 @@ use tokio::time::{self, Instant};
 use crate::console::messages::ColdStartInfo;
 
 #[derive(MetricGroup)]
+#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct Metrics {
     #[metric(namespace = "proxy")]
+    #[metric(init = ProxyMetrics::new(thread_pool))]
     pub proxy: ProxyMetrics,
 
     #[metric(namespace = "wake_compute_lock")]
     pub wake_compute_lock: ApiLockMetrics,
 }
 
+static SELF: OnceLock<Metrics> = OnceLock::new();
 impl Metrics {
+    pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
+        SELF.set(Metrics::new(thread_pool))
+            .ok()
+            .expect("proxy metrics must not be installed more than once");
+    }
+
     pub fn get() -> &'static Self {
-        static SELF: OnceLock<Metrics> = OnceLock::new();
-        SELF.get_or_init(|| Metrics {
-            proxy: ProxyMetrics::default(),
-            wake_compute_lock: ApiLockMetrics::new(),
-        })
+        #[cfg(test)]
+        return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
+
+        #[cfg(not(test))]
+        SELF.get()
+            .expect("proxy metrics must be installed by the main() function")
     }
 }
 
 #[derive(MetricGroup)]
-#[metric(new())]
+#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct ProxyMetrics {
     #[metric(flatten)]
     pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
@@ -129,6 +139,10 @@ pub struct ProxyMetrics {
 
     #[metric(namespace = "connect_compute_lock")]
     pub connect_compute_lock: ApiLockMetrics,
+
+    #[metric(namespace = "scram_pool")]
+    #[metric(init = thread_pool)]
+    pub scram_pool: Arc<ThreadPoolMetrics>,
 }
 
 #[derive(MetricGroup)]
@@ -146,12 +160,6 @@ pub struct ApiLockMetrics {
     pub semaphore_acquire_seconds: Histogram<16>,
 }
 
-impl Default for ProxyMetrics {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl Default for ApiLockMetrics {
     fn default() -> Self {
         Self::new()
@@ -553,3 +561,52 @@ pub enum RedisEventsCount {
     PasswordUpdate,
     AllowedIpsUpdate,
 }
+
+pub struct ThreadPoolWorkers(usize);
+pub struct ThreadPoolWorkerId(pub usize);
+
+impl LabelValue for ThreadPoolWorkerId {
+    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
+        v.write_int(self.0 as i64)
+    }
+}
+
+impl LabelGroup for ThreadPoolWorkerId {
+    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
+        v.write_value(LabelName::from_str("worker"), self);
+    }
+}
+
+impl LabelSet for ThreadPoolWorkers {
+    type Value<'a> = ThreadPoolWorkerId;
+
+    fn dynamic_cardinality(&self) -> Option<usize> {
+        Some(self.0)
+    }
+
+    fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
+        (value.0 < self.0).then_some(value.0)
+    }
+
+    fn decode(&self, value: usize) -> Self::Value<'_> {
+        ThreadPoolWorkerId(value)
+    }
+}
+
+impl FixedCardinalitySet for ThreadPoolWorkers {
+    fn cardinality(&self) -> usize {
+        self.0
+    }
+}
+
+#[derive(MetricGroup)]
+#[metric(new(workers: usize))]
+pub struct ThreadPoolMetrics {
+    pub injector_queue_depth: Gauge,
+    #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
+    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
+    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
+}
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index ed80675f8a..862facb4e5 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -6,11 +6,14 @@
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>
 
+mod countmin;
 mod exchange;
 mod key;
 mod messages;
+mod pbkdf2;
 mod secret;
 mod signature;
+pub mod threadpool;
 
 pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
@@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
 
 #[cfg(test)]
 mod tests {
-    use crate::sasl::{Mechanism, Step};
+    use crate::{
+        intern::EndpointIdInt,
+        sasl::{Mechanism, Step},
+        EndpointId,
+    };
 
-    use super::{Exchange, ServerSecret};
+    use super::{threadpool::ThreadPool, Exchange, ServerSecret};
 
     #[test]
     fn snapshot() {
@@ -112,8 +119,13 @@ mod tests {
     }
 
     async fn run_round_trip_test(server_password: &str, client_password: &str) {
+        let pool = ThreadPool::new(1);
+
+        let ep = EndpointId::from("foo");
+        let ep = EndpointIdInt::from(ep);
+
         let scram_secret = ServerSecret::build(server_password).await.unwrap();
-        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
+        let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
             .await
             .unwrap();
 
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs
new file mode 100644
index 0000000000..f2b794e5fe
--- /dev/null
+++ b/proxy/src/scram/countmin.rs
@@ -0,0 +1,173 @@
+use std::hash::Hash;
+
+/// estimator of hash jobs per second.
+/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
+pub struct CountMinSketch {
+    // one for each depth
+    hashers: Vec<ahash::RandomState>,
+    width: usize,
+    depth: usize,
+    // buckets, width*depth
+    buckets: Vec<u32>,
+}
+
+impl CountMinSketch {
+    /// Given parameters (ε, δ),
+    ///   set width = ceil(e/ε)
+    ///   set depth = ceil(ln(1/δ))
+    ///
+    /// guarantees:
+    /// actual <= estimate
+    /// estimate <= actual + ε * N with probability 1 - δ
+    /// where N is the cardinality of the stream
+    pub fn with_params(epsilon: f64, delta: f64) -> Self {
+        CountMinSketch::new(
+            (std::f64::consts::E / epsilon).ceil() as usize,
+            (1.0_f64 / delta).ln().ceil() as usize,
+        )
+    }
+
+    fn new(width: usize, depth: usize) -> Self {
+        Self {
+            #[cfg(test)]
+            hashers: (0..depth)
+                .map(|i| {
+                    // digits of pi for good randomness
+                    ahash::RandomState::with_seeds(
+                        314159265358979323,
+                        84626433832795028,
+                        84197169399375105,
+                        82097494459230781 + i as u64,
+                    )
+                })
+                .collect(),
+            #[cfg(not(test))]
+            hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
+            width,
+            depth,
+            buckets: vec![0; width * depth],
+        }
+    }
+
+    pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
+        let mut min = u32::MAX;
+        for row in 0..self.depth {
+            let col = (self.hashers[row].hash_one(t) as usize) % self.width;
+
+            let row = &mut self.buckets[row * self.width..][..self.width];
+            row[col] = row[col].saturating_add(x);
+            min = std::cmp::min(min, row[col]);
+        }
+        min
+    }
+
+    pub fn reset(&mut self) {
+        self.buckets.clear();
+        self.buckets.resize(self.width * self.depth, 0);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
+
+    use super::CountMinSketch;
+
+    fn eval_precision(n: usize, p: f64, q: f64) -> usize {
+        // fixed value of phi for consistent test
+        let mut rng = StdRng::seed_from_u64(16180339887498948482);
+
+        #[allow(non_snake_case)]
+        let mut N = 0;
+
+        let mut ids = vec![];
+
+        for _ in 0..n {
+            // number of insert operations
+            let n = rng.gen_range(1..100);
+            // number to insert at once
+            let m = rng.gen_range(1..4096);
+
+            let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
+            ids.push((id, n, m));
+
+            // N = sum(actual)
+            N += n * m;
+        }
+
+        // q% of counts will be within p of the actual value
+        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        dbg!(sketch.buckets.len());
+
+        // insert a bunch of entries in a random order
+        let mut ids2 = ids.clone();
+        while !ids2.is_empty() {
+            ids2.shuffle(&mut rng);
+
+            let mut i = 0;
+            while i < ids2.len() {
+                sketch.inc_and_return(&ids2[i].0, ids2[i].1);
+                ids2[i].2 -= 1;
+                if ids2[i].2 == 0 {
+                    ids2.remove(i);
+                } else {
+                    i += 1;
+                }
+            }
+        }
+
+        let mut within_p = 0;
+        for (id, n, m) in ids {
+            let actual = n * m;
+            let estimate = sketch.inc_and_return(&id, 0);
+
+            // This estimate has the guarantee that actual <= estimate
+            assert!(actual <= estimate);
+
+            // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
+            // ε = p / N, δ = 1 - q;
+            // therefore, estimate <= actual + p with probability q.
+            if estimate as f64 <= actual as f64 + p {
+                within_p += 1;
+            }
+        }
+        within_p
+    }
+
+    #[test]
+    fn precision() {
+        assert_eq!(eval_precision(100, 100.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
+
+        // seems to be more precise than the literature indicates?
+        // probably numbers are too small to truly represent the probabilities.
+        assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
+        assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
+    }
+
+    // returns memory usage in bytes, and the time complexity per insert.
+    fn eval_cost(p: f64, q: f64) -> (usize, usize) {
+        #[allow(non_snake_case)]
+        // N = sum(actual)
+        // Let's assume 1021 samples, all of 4096
+        let N = 1021 * 4096;
+        let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
+        let time = sketch.depth;
+        (memory, time)
+    }
+
+    #[test]
+    fn memory_usage() {
+        assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
+        assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
+        assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
+        assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
+    }
+}
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 89dd33e59f..d0adbc780e 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -4,15 +4,17 @@ use std::convert::Infallible;
 
 use hmac::{Hmac, Mac};
 use sha2::Sha256;
-use tokio::task::yield_now;
 
 use super::messages::{
     ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
+use super::pbkdf2::Pbkdf2;
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
+use super::threadpool::ThreadPool;
 use super::ScramKey;
 use crate::config;
+use crate::intern::EndpointIdInt;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};
 
 /// The only channel binding mode we currently support.
@@ -74,37 +76,18 @@ impl<'a> Exchange<'a> {
     }
 }
 
-// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
-async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
-    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-    let mut prev = hmac
-        .clone()
-        .chain_update(salt)
-        .chain_update(1u32.to_be_bytes())
-        .finalize()
-        .into_bytes();
-
-    let mut hi = prev;
-
-    for i in 1..iterations {
-        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
-
-        for (hi, prev) in hi.iter_mut().zip(prev) {
-            *hi ^= prev;
-        }
-        // yield every ~250us
-        // hopefully reduces tail latencies
-        if i % 1024 == 0 {
-            yield_now().await
-        }
-    }
-
-    hi.into()
-}
-
 // copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
-async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
-    let salted_password = pbkdf2(password, salt, iterations).await;
+async fn derive_client_key(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
+    password: &[u8],
+    salt: &[u8],
+    iterations: u32,
+) -> ScramKey {
+    let salted_password = pool
+        .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
+        .await
+        .expect("job should not be cancelled");
 
     let make_key = |name| {
         let key = Hmac::<Sha256>::new_from_slice(&salted_password)
@@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr
 }
 
 pub async fn exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
     secret: &ServerSecret,
     password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
     let salt = base64::decode(&secret.salt_base64)?;
-    let client_key = derive_client_key(password, &salt, secret.iterations).await;
+    let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
 
     if secret.is_password_invalid(&client_key).into() {
         Ok(sasl::Outcome::Failure("password doesn't match"))
diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs
new file mode 100644
index 0000000000..a803ba7e1b
--- /dev/null
+++ b/proxy/src/scram/pbkdf2.rs
@@ -0,0 +1,89 @@
+use hmac::{
+    digest::{consts::U32, generic_array::GenericArray},
+    Hmac, Mac,
+};
+use sha2::Sha256;
+
+pub struct Pbkdf2 {
+    hmac: Hmac<Sha256>,
+    prev: GenericArray<u8, U32>,
+    hi: GenericArray<u8, U32>,
+    iterations: u32,
+}
+
+// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
+impl Pbkdf2 {
+    pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
+        let hmac =
+            Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+
+        let prev = hmac
+            .clone()
+            .chain_update(salt)
+            .chain_update(1u32.to_be_bytes())
+            .finalize()
+            .into_bytes();
+
+        Self {
+            hmac,
+            // one consumed for the hash above
+            iterations: iterations - 1,
+            hi: prev,
+            prev,
+        }
+    }
+
+    pub fn cost(&self) -> u32 {
+        (self.iterations).clamp(0, 4096)
+    }
+
+    pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
+        let Self {
+            hmac,
+            prev,
+            hi,
+            iterations,
+        } = self;
+
+        // only do 4096 iterations per turn before sharing the thread for fairness
+        let n = (*iterations).clamp(0, 4096);
+        for _ in 0..n {
+            *prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
+
+            for (hi, prev) in hi.iter_mut().zip(*prev) {
+                *hi ^= prev;
+            }
+        }
+
+        *iterations -= n;
+        if *iterations == 0 {
+            std::task::Poll::Ready((*hi).into())
+        } else {
+            std::task::Poll::Pending
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Pbkdf2;
+    use pbkdf2::pbkdf2_hmac_array;
+    use sha2::Sha256;
+
+    #[test]
+    fn works() {
+        let salt = b"sodium chloride";
+        let pass = b"Ne0n_!5_50_C007";
+
+        let mut job = Pbkdf2::start(pass, salt, 600000);
+        let hash = loop {
+            let std::task::Poll::Ready(hash) = job.turn() else {
+                continue;
+            };
+            break hash;
+        };
+
+        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
+        assert_eq!(hash, expected)
+    }
+}
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
new file mode 100644
index 0000000000..7701b869a3
--- /dev/null
+++ b/proxy/src/scram/threadpool.rs
@@ -0,0 +1,321 @@
+//! Custom threadpool implementation for password hashing.
+//!
+//! Requirements:
+//! 1. Fairness per endpoint.
+//! 2. Yield support for high iteration counts.
+
+use std::sync::{
+    atomic::{AtomicU64, Ordering},
+    Arc,
+};
+
+use crossbeam_deque::{Injector, Stealer, Worker};
+use itertools::Itertools;
+use parking_lot::{Condvar, Mutex};
+use rand::Rng;
+use rand::{rngs::SmallRng, SeedableRng};
+use tokio::sync::oneshot;
+
+use crate::{
+    intern::EndpointIdInt,
+    metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
+    scram::countmin::CountMinSketch,
+};
+
+use super::pbkdf2::Pbkdf2;
+
+pub struct ThreadPool {
+    queue: Injector<JobSpec>,
+    stealers: Vec<Stealer<JobSpec>>,
+    parkers: Vec<(Condvar, Mutex<ThreadState>)>,
+    /// bitpacked representation.
+    /// lower 8 bits = number of sleeping threads
+    /// next 8 bits = number of idle threads (searching for work)
+    counters: AtomicU64,
+
+    pub metrics: Arc<ThreadPoolMetrics>,
+}
+
+#[derive(PartialEq)]
+enum ThreadState {
+    Parked,
+    Active,
+}
+
+impl ThreadPool {
+    pub fn new(n_workers: u8) -> Arc<Self> {
+        let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
+        let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
+
+        let parkers = (0..n_workers)
+            .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
+            .collect_vec();
+
+        let pool = Arc::new(Self {
+            queue: Injector::new(),
+            stealers,
+            parkers,
+            // threads start searching for work
+            counters: AtomicU64::new((n_workers as u64) << 8),
+            metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
+        });
+
+        for (i, worker) in workers.into_iter().enumerate() {
+            let pool = Arc::clone(&pool);
+            std::thread::spawn(move || thread_rt(pool, worker, i));
+        }
+
+        pool
+    }
+
+    pub fn spawn_job(
+        &self,
+        endpoint: EndpointIdInt,
+        pbkdf2: Pbkdf2,
+    ) -> oneshot::Receiver<[u8; 32]> {
+        let (tx, rx) = oneshot::channel();
+
+        let queue_was_empty = self.queue.is_empty();
+
+        self.metrics.injector_queue_depth.inc();
+        self.queue.push(JobSpec {
+            response: tx,
+            pbkdf2,
+            endpoint,
+        });
+
+        // inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
+        let counts = self.counters.load(Ordering::SeqCst);
+        let num_awake_but_idle = (counts >> 8) & 0xff;
+        let num_sleepers = counts & 0xff;
+
+        // If the queue is non-empty, then we always wake up a worker
+        // -- clearly the existing idle jobs aren't enough. Otherwise,
+        // check to see if we have enough idle workers.
+        if !queue_was_empty || num_awake_but_idle == 0 {
+            let num_to_wake = Ord::min(1, num_sleepers);
+            self.wake_any_threads(num_to_wake);
+        }
+
+        rx
+    }
+
+    #[cold]
+    fn wake_any_threads(&self, mut num_to_wake: u64) {
+        if num_to_wake > 0 {
+            for i in 0..self.parkers.len() {
+                if self.wake_specific_thread(i) {
+                    num_to_wake -= 1;
+                    if num_to_wake == 0 {
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    fn wake_specific_thread(&self, index: usize) -> bool {
+        let (condvar, lock) = &self.parkers[index];
+
+        let mut state = lock.lock();
+        if *state == ThreadState::Parked {
+            condvar.notify_one();
+
+            // When the thread went to sleep, it will have incremented
+            // this value. When we wake it, its our job to decrement
+            // it. We could have the thread do it, but that would
+            // introduce a delay between when the thread was
+            // *notified* and when this counter was decremented. That
+            // might mislead people with new work into thinking that
+            // there are sleeping threads that they should try to
+            // wake, when in fact there is nothing left for them to
+            // do.
+            self.counters.fetch_sub(1, Ordering::SeqCst);
+            *state = ThreadState::Active;
+
+            true
+        } else {
+            false
+        }
+    }
+
+    fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
+        // announce thread as idle
+        self.counters.fetch_add(256, Ordering::SeqCst);
+
+        // try steal from the global queue
+        loop {
+            match self.queue.steal_batch_and_pop(worker) {
+                crossbeam_deque::Steal::Success(job) => {
+                    self.metrics
+                        .injector_queue_depth
+                        .set(self.queue.len() as i64);
+                    // no longer idle
+                    self.counters.fetch_sub(256, Ordering::SeqCst);
+                    return Some(job);
+                }
+                crossbeam_deque::Steal::Retry => continue,
+                crossbeam_deque::Steal::Empty => break,
+            }
+        }
+
+        // try steal from our neighbours
+        loop {
+            let mut retry = false;
+            let start = rng.gen_range(0..self.stealers.len());
+            let job = (start..self.stealers.len())
+                .chain(0..start)
+                .filter(|i| *i != skip)
+                .find_map(
+                    |victim| match self.stealers[victim].steal_batch_and_pop(worker) {
+                        crossbeam_deque::Steal::Success(job) => Some(job),
+                        crossbeam_deque::Steal::Empty => None,
+                        crossbeam_deque::Steal::Retry => {
+                            retry = true;
+                            None
+                        }
+                    },
+                );
+            if job.is_some() {
+                // no longer idle
+                self.counters.fetch_sub(256, Ordering::SeqCst);
+                return job;
+            }
+            if !retry {
+                return None;
+            }
+        }
+    }
+}
+
+fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
+    /// interval when we should steal from the global queue
+    /// so that tail latencies are managed appropriately
+    const STEAL_INTERVAL: usize = 61;
+
+    /// How often to reset the sketch values
+    const SKETCH_RESET_INTERVAL: usize = 1021;
+
+    let mut rng = SmallRng::from_entropy();
+
+    // used to determine whether we should temporarily skip tasks for fairness.
+    // 99% of estimates will overcount by no more than 4096 samples
+    let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
+
+    let (condvar, lock) = &pool.parkers[index];
+
+    'wait: loop {
+        // wait for notification of work
+        {
+            let mut lock = lock.lock();
+
+            // queue is empty
+            pool.metrics
+                .worker_queue_depth
+                .set(ThreadPoolWorkerId(index), 0);
+
+            // subtract 1 from idle count, add 1 to sleeping count.
+            pool.counters.fetch_sub(255, Ordering::SeqCst);
+
+            *lock = ThreadState::Parked;
+            condvar.wait(&mut lock);
+        }
+
+        for i in 0.. {
+            let mut job = match worker
+                .pop()
+                .or_else(|| pool.steal(&mut rng, index, &worker))
+            {
+                Some(job) => job,
+                None => continue 'wait,
+            };
+
+            pool.metrics
+                .worker_queue_depth
+                .set(ThreadPoolWorkerId(index), worker.len() as i64);
+
+            // receiver is closed, cancel the task
+            if !job.response.is_closed() {
+                let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
+
+                const P: f64 = 2000.0;
+                // probability decreases as rate increases.
+                // lower probability, higher chance of being skipped
+                //
+                // estimates (rate in terms of 4096 rounds):
+                // rate = 0    => probability = 100%
+                // rate = 10   => probability = 71.3%
+                // rate = 50   => probability = 62.1%
+                // rate = 500  => probability = 52.3%
+                // rate = 1021 => probability = 49.8%
+                //
+                // My expectation is that the pool queue will only begin backing up at ~1000rps
+                // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
+                // are in requests per second.
+                let probability = P.ln() / (P + rate as f64).ln();
+                if pool.queue.len() > 32 || rng.gen_bool(probability) {
+                    pool.metrics
+                        .worker_task_turns_total
+                        .inc(ThreadPoolWorkerId(index));
+
+                    match job.pbkdf2.turn() {
+                        std::task::Poll::Ready(result) => {
+                            let _ = job.response.send(result);
+                        }
+                        std::task::Poll::Pending => worker.push(job),
+                    }
+                } else {
+                    pool.metrics
+                        .worker_task_skips_total
+                        .inc(ThreadPoolWorkerId(index));
+
+                    // skip for now
+                    worker.push(job)
+                }
+            }
+
+            // if we get stuck with a few long lived jobs in the queue
+            // it's better to try and steal from the queue too for fairness
+            if i % STEAL_INTERVAL == 0 {
+                let _ = pool.queue.steal_batch(&worker);
+            }
+
+            if i % SKETCH_RESET_INTERVAL == 0 {
+                sketch.reset();
+            }
+        }
+    }
+}
+
+struct JobSpec {
+    response: oneshot::Sender<[u8; 32]>,
+    pbkdf2: Pbkdf2,
+    endpoint: EndpointIdInt,
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::EndpointId;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn hash_is_correct() {
+        let pool = ThreadPool::new(1);
+
+        let ep = EndpointId::from("foo");
+        let ep = EndpointIdInt::from(ep);
+
+        let salt = [0x55; 32];
+        let actual = pool
+            .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
+            .await
+            .unwrap();
+
+        let expected = [
+            10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
+            178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
+        ];
+        assert_eq!(actual, expected)
+    }
+}
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 6b79c12316..52fc7b556a 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -15,6 +15,7 @@ use crate::{
     },
     context::RequestMonitoring,
     error::{ErrorKind, ReportableError, UserFacingError},
+    intern::EndpointIdInt,
     proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
     rate_limiter::EndpointRateLimiter,
     Host,
@@ -66,8 +67,14 @@ impl PoolingBackend {
                 return Err(AuthError::auth_failed(&*user_info.user));
             }
         };
-        let auth_outcome =
-            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
+        let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
+        let auth_outcome = crate::auth::validate_password_and_exchange(
+            &config.thread_pool,
+            ep,
+            &conn_info.password,
+            secret,
+        )
+        .await?;
         let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => {
                 info!("user successfully authenticated");
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 7582562450..f364a6c2e0 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -13,6 +13,7 @@ publish = false
 
 ### BEGIN HAKARI SECTION
 [dependencies]
+ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] }
 aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] }
@@ -85,6 +86,7 @@ zstd-safe = { version = "7", default-features = false, features = ["arrays", "le
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }
 
 [build-dependencies]
+ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }

From ddd8ebd2536f90872da2678aecbd94a3e1b3f547 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 22 May 2024 13:06:00 -0400
Subject: [PATCH 124/130] chore(pageserver): use kebab case for aux file flag
 (#7840)

part of https://github.com/neondatabase/neon/issues/7462

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs             | 45 +++++++++++--------
 test_runner/fixtures/neon_fixtures.py         |  2 +-
 test_runner/fixtures/utils.py                 |  6 +--
 .../regress/test_attach_tenant_config.py      |  2 +-
 test_runner/regress/test_aux_files.py         | 12 +++--
 5 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 55e9c48421..6b0403c8ab 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,6 @@ use std::{
     collections::HashMap,
     io::{BufRead, Read},
     num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
     sync::atomic::AtomicUsize,
     time::{Duration, SystemTime},
 };
@@ -334,14 +333,28 @@ pub struct TenantConfig {
 /// Unset -> V1
 ///       -> V2
 ///       -> CrossValidation -> V2
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum AuxFilePolicy {
     /// V1 aux file policy: store everything in AUX_FILE_KEY
+    #[strum(ascii_case_insensitive)]
     V1,
     /// V2 aux file policy: store in the AUX_FILE keyspace
+    #[strum(ascii_case_insensitive)]
     V2,
     /// Cross validation runs both formats on the write path and does validation
     /// on the read path.
+    #[strum(ascii_case_insensitive)]
     CrossValidation,
 }
 
@@ -407,23 +420,6 @@ impl AuxFilePolicy {
     }
 }
 
-impl FromStr for AuxFilePolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let s = s.to_lowercase();
-        if s == "v1" {
-            Ok(Self::V1)
-        } else if s == "v2" {
-            Ok(Self::V2)
-        } else if s == "crossvalidation" || s == "cross_validation" {
-            Ok(Self::CrossValidation)
-        } else {
-            anyhow::bail!("cannot parse {} to aux file policy", s)
-        }
-    }
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum EvictionPolicy {
@@ -1405,6 +1401,7 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
     use serde_json::json;
+    use std::str::FromStr;
 
     use super::*;
 
@@ -1667,4 +1664,14 @@ mod tests {
             AuxFilePolicy::V2
         ));
     }
+
+    #[test]
+    fn test_aux_parse() {
+        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
+        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
+        assert_eq!(
+            AuxFilePolicy::from_str("cross-validation").unwrap(),
+            AuxFilePolicy::CrossValidation
+        );
+    }
 }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b02054a702..796ae7217b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1625,7 +1625,7 @@ class NeonCli(AbstractNeonCli):
             args.extend(["-c", "switch_aux_file_policy:v1"])
 
         if aux_file_v2 is AuxFileStore.CrossValidation:
-            args.extend(["-c", "switch_aux_file_policy:cross_validation"])
+            args.extend(["-c", "switch_aux_file_policy:cross-validation"])
 
         if set_default:
             args.append("--set-default")
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 70263245e7..22bb43c580 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -495,9 +495,9 @@ def assert_no_errors(log_file, service, allowed_errors):
 
 @enum.unique
 class AuxFileStore(str, enum.Enum):
-    V1 = "V1"
-    V2 = "V2"
-    CrossValidation = "CrossValidation"
+    V1 = "v1"
+    V2 = "v2"
+    CrossValidation = "cross-validation"
 
     def __repr__(self) -> str:
         return f"'aux-{self.value}'"
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 2ec375271c..531679c7af 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "trace_read_requests": True,
         "walreceiver_connect_timeout": "13m",
         "image_layer_creation_check_threshold": 1,
-        "switch_aux_file_policy": "CrossValidation",
+        "switch_aux_file_policy": "cross-validation",
     }
 
     ps_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py
index be9c41a867..5328aef156 100644
--- a/test_runner/regress/test_aux_files.py
+++ b/test_runner/regress/test_aux_files.py
@@ -1,5 +1,6 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    AuxFileStore,
     NeonEnvBuilder,
     logical_replication_sync,
 )
@@ -14,7 +15,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
     timeline_id = env.initial_timeline
 
     tenant_config = client.tenant_config(tenant_id).effective_config
-    tenant_config["switch_aux_file_policy"] = "V2"
+    tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
     client.set_tenant_config(tenant_id, tenant_config)
     # aux file v2 is enabled on the write path, so for now, it should be unset (or null)
     assert (
@@ -49,7 +50,10 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
 
     with env.pageserver.http_client() as client:
         # aux file v2 flag should be enabled at this point
-        assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
+        assert (
+            client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
+            == AuxFileStore.V2
+        )
     with env.pageserver.http_client() as client:
         tenant_config = client.tenant_config(tenant_id).effective_config
         tenant_config["switch_aux_file_policy"] = "V1"
@@ -59,7 +63,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
             client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
                 "last_aux_file_policy"
             ]
-            == "V2"
+            == AuxFileStore.V2
         )
     env.pageserver.restart()
     with env.pageserver.http_client() as client:
@@ -68,5 +72,5 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
             client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
                 "last_aux_file_policy"
             ]
-            == "V2"
+            == AuxFileStore.V2
         )

From 014f822a789efd2a64fbcf79de5acd2f07018952 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 22 May 2024 19:17:47 +0100
Subject: [PATCH 125/130] tests: refine test_secondary_background_downloads
 (#7829)

## Problem

This test relied on some sleeps, and was failing ~5% of the time.

## Summary of changes

Use log-watching rather than straight waits, and make timeouts more
generous for the CI environment.
---
 .../regress/test_pageserver_secondary.py      | 51 +++++++++++++++++--
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 127340a1e7..25a3f8521c 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -578,7 +578,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
     default_download_period_secs = 60
 
     # The upload period, which will also be the download once the secondary has seen its first heatmap
-    upload_period_secs = 20
+    upload_period_secs = 30
 
     for _i in range(0, tenant_count):
         tenant_id = TenantId.generate()
@@ -596,11 +596,26 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
 
         tenant_timelines[tenant_id] = [timeline_a, timeline_b]
 
+    def await_log(pageserver, deadline, expression):
+        """
+        Wrapper around assert_log_contains that waits with a deadline rather than timeout
+        """
+        now = time.time()
+        if now > deadline:
+            raise RuntimeError(f"Timed out waiting for {expression}")
+        else:
+            timeout = int(deadline - now) + 1
+            try:
+                wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))  # type: ignore
+            except:
+                log.error(f"Timed out waiting for '{expression}'")
+                raise
+
     t_start = time.time()
 
     # Wait long enough that the background downloads should happen; we expect all the inital layers
     # of all the initial timelines to show up on the secondary location of each tenant.
-    time.sleep(default_download_period_secs * 1.5)
+    initial_download_deadline = time.time() + default_download_period_secs * 3
 
     for tenant_id, timelines in tenant_timelines.items():
         attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -608,8 +623,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         # We only have two: the other one must be secondary
         ps_secondary = next(p for p in env.pageservers if p != ps_attached)
 
+        now = time.time()
+        if now > initial_download_deadline:
+            raise RuntimeError("Timed out waiting for initial secondary download")
+        else:
+            for timeline_id in timelines:
+                log.info(
+                    f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
+                )
+                await_log(
+                    ps_secondary,
+                    initial_download_deadline,
+                    f".*{timeline_id}.*Wrote timeline_detail.*",
+                )
+
         for timeline_id in timelines:
-            log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
+            log.info(
+                f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
+            )
             # One or more layers should be present for all timelines
             assert ps_secondary.list_layers(tenant_id, timeline_id)
 
@@ -617,7 +648,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
 
     # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
-    time.sleep(upload_period_secs * 2.5)
+    deletion_deadline = time.time() + upload_period_secs * 3
 
     for tenant_id, timelines in tenant_timelines.items():
         attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -625,6 +656,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         # We only have two: the other one must be secondary
         ps_secondary = next(p for p in env.pageservers if p != ps_attached)
 
+        expect_del_timeline = timelines[1]
+        log.info(
+            f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
+        )
+        await_log(
+            ps_secondary,
+            deletion_deadline,
+            f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
+        )
+
         # This one was not deleted
         assert ps_secondary.list_layers(tenant_id, timelines[0])
 
@@ -632,7 +673,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         log.info(
             f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
         )
-        assert not ps_secondary.list_layers(tenant_id, timelines[1])
+        assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)
 
     t_end = time.time()
 

From f98fdd20e390822a890f596a7932ca81b47e00ce Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 22 May 2024 19:38:22 +0100
Subject: [PATCH 126/130] tests: add a couple of allow lists for shutdown cases
 (#7844)

## Problem

Failures on some of our uglier shutdown log messages:

https://neon-github-public-dev.s3.amazonaws.com/reports/main/9192662995/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/51b365408678c66f/

## Summary of changes

- Allow-list these errors.
---
 test_runner/fixtures/pageserver/allowed_errors.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 91cd67d107..fa6e4eaafd 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -70,6 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # this is expected given our collaborative shutdown approach for the UploadQueue
     ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
     ".*Compaction failed.*, retrying in .*: ShuttingDown",
+    ".*Compaction failed.*, retrying in .*: timeline shutting down.*",
     # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
     ".*Error processing HTTP request: NotFound: Timeline .* was not found",
     ".*took more than expected to complete.*",
@@ -91,6 +92,10 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
     # Can happen when the test shuts down the storage controller while it is calling the utilization API
     ".*WARN.*path=/v1/utilization .*request was dropped before completing",
+    # Can happen during shutdown
+    ".*scheduling deletion on drop failed: queue is in state Stopped.*",
+    # Can happen during shutdown
+    ".*ignoring failure to find gc cutoffs: timeline shutting down.*",
 )
 
 

From 4a278cce7ce5b7f32360e85fd41219df95cc9a86 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 22 May 2024 15:05:26 -0400
Subject: [PATCH 127/130] chore(pageserver): add force aux file policy switch
 handler (#7842)

For existing users, we want to allow doing a force switch for their aux
file policy.

Part of #7462

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs       | 78 ++++++++++++++++++-----------
 pageserver/src/pgdatadir_mapping.rs | 14 +-----
 pageserver/src/tenant.rs            | 61 ++++++++++++++++++++++
 pageserver/src/tenant/timeline.rs   |  8 +++
 4 files changed, 119 insertions(+), 42 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 34b9806a26..7b55e88096 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -16,6 +16,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
@@ -2307,6 +2308,31 @@ async fn post_tracing_event_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn force_aux_policy_switch_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
+    let policy: AuxFilePolicy = json_request(&mut r).await?;
+
+    let state = get_state(&r);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    timeline
+        .do_switch_aux_policy(policy)
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn put_io_engine_handler(
     mut r: Request<Body>,
     _cancel: CancellationToken,
@@ -2384,19 +2410,9 @@ async fn list_aux_files(
         active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
             .await?;
 
-    let process = || async move {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let files = timeline.list_aux_files(body.lsn, &ctx).await?;
-        Ok::<_, anyhow::Error>(files)
-    };
-
-    match process().await {
-        Ok(st) => json_response(StatusCode::OK, st),
-        Err(err) => json_response(
-            StatusCode::INTERNAL_SERVER_ERROR,
-            ApiError::InternalServerError(err).to_string(),
-        ),
-    }
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let files = timeline.list_aux_files(body.lsn, &ctx).await?;
+    json_response(StatusCode::OK, files)
 }
 
 async fn ingest_aux_files(
@@ -2414,24 +2430,22 @@ async fn ingest_aux_files(
         active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
             .await?;
 
-    let process = || async move {
-        let mut modification = timeline.begin_modification(Lsn(
-            timeline.get_last_record_lsn().0 + 8
-        ) /* advance LSN by 8 */);
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        for (fname, content) in body.aux_files {
-            modification
-                .put_file(&fname, content.as_bytes(), &ctx)
-                .await?;
-        }
-        modification.commit(&ctx).await?;
-        Ok::<_, anyhow::Error>(())
-    };
-
-    match process().await {
-        Ok(st) => json_response(StatusCode::OK, st),
-        Err(err) => Err(ApiError::InternalServerError(err)),
+    let mut modification = timeline.begin_modification(
+        Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
+    );
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    for (fname, content) in body.aux_files {
+        modification
+            .put_file(&fname, content.as_bytes(), &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
     }
+    modification
+        .commit(&ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
 }
 
 /// Report on the largest tenants on this pageserver, for the storage controller to identify
@@ -2814,6 +2828,10 @@ pub fn make_router(
             |r| api_handler(r, timeline_collect_keyspace),
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
+            |r| api_handler(r, force_aux_policy_switch_handler),
+        )
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 7dea687c46..afba34c6d1 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1489,14 +1489,7 @@ impl<'a> DatadirModification<'a> {
                 if aux_files_key_v1.is_empty() {
                     None
                 } else {
-                    self.tline
-                        .last_aux_file_policy
-                        .store(Some(AuxFilePolicy::V1));
-                    self.tline
-                        .remote_client
-                        .schedule_index_upload_for_aux_file_policy_update(Some(
-                            AuxFilePolicy::V1,
-                        ))?;
+                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                     Some(AuxFilePolicy::V1)
                 }
             } else {
@@ -1504,10 +1497,7 @@ impl<'a> DatadirModification<'a> {
             };
 
             if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
-                self.tline.last_aux_file_policy.store(Some(switch_policy));
-                self.tline
-                    .remote_client
-                    .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
+                self.tline.do_switch_aux_policy(switch_policy)?;
                 info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
                 switch_policy
             } else {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index caf26e0a0b..2bb199b228 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5999,6 +5999,67 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn aux_file_policy_force_switch() {
+        let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "dirty index_part.json reflected state is yet to be updated"
+        );
+
+        // lose all data from v1
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // read data ingested in v2
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+        // lose all data from v1
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+    }
+
     #[tokio::test]
     async fn aux_file_policy_auto_detect() {
         let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1f8ee9ffc4..63d03b3c68 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4606,6 +4606,14 @@ impl Timeline {
     ) -> Result<Vec<TimelineId>, anyhow::Error> {
         detach_ancestor::complete(self, tenant, prepared, ctx).await
     }
+
+    /// Switch aux file policy and schedule upload to the index part.
+    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
+        self.last_aux_file_policy.store(Some(policy));
+        self.remote_client
+            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
+        Ok(())
+    }
 }
 
 /// Top-level failure to compact.

From ff560a1113046ff29acf2723c0183db7c2d128f1 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 22 May 2024 17:28:47 -0400
Subject: [PATCH 128/130] chore(pageserver): use kebab case for compaction
 algorithms (#7845)

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs             | 21 ++++++++++++++++---
 pageserver/src/tenant.rs                      | 10 ++++++---
 pageserver/src/tenant/config.rs               | 13 ++++++++----
 pageserver/src/tenant/timeline.rs             | 14 +++++++------
 .../regress/test_attach_tenant_config.py      |  2 +-
 test_runner/regress/test_compaction.py        |  4 ++--
 6 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 6b0403c8ab..9311dab33c 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -305,7 +305,7 @@ pub struct TenantConfig {
     pub compaction_period: Option<String>,
     pub compaction_threshold: Option<usize>,
     // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
     pub gc_horizon: Option<u64>,
     pub gc_period: Option<String>,
     pub image_creation_threshold: Option<usize>,
@@ -438,13 +438,28 @@ impl EvictionPolicy {
     }
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "kind")]
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum CompactionAlgorithm {
     Legacy,
     Tiered,
 }
 
+#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
+pub struct CompactionAlgorithmSettings {
+    pub kind: CompactionAlgorithm,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
     #[serde(with = "humantime_serde")]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2bb199b228..540eb10ed2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3973,7 +3973,7 @@ mod tests {
     use hex_literal::hex;
     use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::CompactionAlgorithm;
+    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     use rand::{thread_rng, Rng};
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -5169,7 +5169,9 @@ mod tests {
         compaction_algorithm: CompactionAlgorithm,
     ) -> anyhow::Result<()> {
         let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
+            kind: compaction_algorithm,
+        };
         let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5526,7 +5528,9 @@ mod tests {
         compaction_algorithm: CompactionAlgorithm,
     ) -> anyhow::Result<()> {
         let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
+            kind: compaction_algorithm,
+        };
         let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index a695363cdc..342d705954 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,6 +11,7 @@
 use anyhow::bail;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
+use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -320,7 +321,7 @@ pub struct TenantConf {
     pub compaction_period: Duration,
     // Level0 delta layer threshold for compaction.
     pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithm,
+    pub compaction_algorithm: CompactionAlgorithmSettings,
     // Determines how much history is retained, to allow
     // branching and read replicas at an older point in time.
     // The unit is #of bytes of WAL.
@@ -406,7 +407,7 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
 
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
@@ -497,7 +498,9 @@ impl TenantConfOpt {
                 .unwrap_or(global_conf.compaction_threshold),
             compaction_algorithm: self
                 .compaction_algorithm
-                .unwrap_or(global_conf.compaction_algorithm),
+                .as_ref()
+                .unwrap_or(&global_conf.compaction_algorithm)
+                .clone(),
             gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
             gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
             image_creation_threshold: self
@@ -550,7 +553,9 @@ impl Default for TenantConf {
             compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                 .expect("cannot parse default compaction period"),
             compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
+            compaction_algorithm: CompactionAlgorithmSettings {
+                kind: DEFAULT_COMPACTION_ALGORITHM,
+            },
             gc_horizon: DEFAULT_GC_HORIZON,
             gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                 .expect("cannot parse default gc period"),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 63d03b3c68..881e7f8f3c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,9 +23,9 @@ use pageserver_api::{
     },
     keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
-        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
-        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
-        LsnLease, TimelineState,
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
+        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
+        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
     },
     reltag::BlockNumber,
     shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -1700,7 +1700,7 @@ impl Timeline {
             return Ok(());
         }
 
-        match self.get_compaction_algorithm() {
+        match self.get_compaction_algorithm_settings().kind {
             CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
             CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
         }
@@ -2096,12 +2096,14 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
-    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
+    fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
         let tenant_conf = &self.tenant_conf.load();
         tenant_conf
             .tenant_conf
             .compaction_algorithm
-            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
+            .as_ref()
+            .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
+            .clone()
     }
 
     fn get_eviction_policy(&self) -> EvictionPolicy {
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 531679c7af..8c60b454d8 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -162,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "checkpoint_distance": 10000,
         "checkpoint_timeout": "13m",
         "compaction_algorithm": {
-            "kind": "Tiered",
+            "kind": "tiered",
         },
         "eviction_policy": {
             "kind": "LayerAccessThreshold",
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 6a3515e1bd..4850a5c688 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -194,8 +194,8 @@ def test_sharding_compaction(
 
 
 class CompactionAlgorithm(str, enum.Enum):
-    LEGACY = "Legacy"
-    TIERED = "Tiered"
+    LEGACY = "legacy"
+    TIERED = "tiered"
 
 
 @pytest.mark.parametrize(

From eb0c026aac95bf8aad1c115aabb5697e86594eac Mon Sep 17 00:00:00 2001
From: Oleg Vasilev <oleg@neon.tech>
Date: Thu, 23 May 2024 00:48:59 +0300
Subject: [PATCH 129/130] Bump vm-builder v0.28.1 -> v0.29.3 (#7849)

One change:
runner: allow coredump collection (#931)
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2ab1417d6d..f8c011a0a5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -934,7 +934,7 @@ jobs:
       matrix:
         version: [ v14, v15, v16 ]
     env:
-      VM_BUILDER_VERSION: v0.28.1
+      VM_BUILDER_VERSION: v0.29.3
 
     steps:
       - name: Checkout

From 03c2c569be7a0e379743d99884c2990d484c67f2 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Thu, 23 May 2024 11:41:29 +0200
Subject: [PATCH 130/130] [proxy] Do not fail after parquet upload error
 (#7858)

## Problem

If the parquet upload was unsuccessful, it will panic.

## Summary of changes

Write error in logs instead.
---
 proxy/src/context/parquet.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 392821c430..a213a32ca4 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -355,7 +355,7 @@ async fn upload_parquet(
         "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
     ))?;
     let cancel = CancellationToken::new();
-    backoff::retry(
+    let maybe_err = backoff::retry(
         || async {
             let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
             storage
@@ -372,7 +372,12 @@ async fn upload_parquet(
     .await
     .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
     .and_then(|x| x)
-    .context("request_data_upload")?;
+    .context("request_data_upload")
+    .err();
+
+    if let Some(err) = maybe_err {
+        tracing::warn!(%id, %err, "failed to upload request data");
+    }
 
     Ok(buffer.writer())
 }