From df7a9d1407cf189a86dc54c8de7b7de4936dd842 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 17 Mar 2022 00:43:28 +0300 Subject: [PATCH 001/412] release fix 2022-03-16 (#1375) --- .circleci/ansible/deploy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index c95524a8a5..2dd109f99a 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -119,7 +119,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID tags: - pageserver @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From 73f247d537ebea4719461e0d0d3a8c5c92e45bb0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 31 May 2022 16:00:50 +0400 Subject: [PATCH 002/412] Bump vendor/postgres to hotfix basebackup LSN comparison. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 038b2b98e5..658157375a 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 +Subproject commit 658157375a2b1b574766c1a055dde224c269a2f8 From cf350c6002e1107f8b800b5cc4e4f273ea432c5f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 17:36:35 +0200 Subject: [PATCH 003/412] Use :local compute-tools tag to build compute-node image --- .circleci/config.yml | 24 ++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5346e35c01..3377b907cb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -462,9 +462,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:latest image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -472,7 +469,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:local \ + --tag neondatabase/compute-tools:latest \ + -f Dockerfile.compute-tools . 
+ # Only push :latest image docker push neondatabase/compute-tools:latest - run: name: Init postgres submodule @@ -482,7 +482,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:latest vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:latest @@ -519,9 +521,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:release image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -529,7 +528,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:release \ + --tag neondatabase/compute-tools:local \ + -f Dockerfile.compute-tools . + # Only push :release image docker push neondatabase/compute-tools:release - run: name: Init postgres submodule @@ -539,7 +541,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:release vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:release diff --git a/vendor/postgres b/vendor/postgres index 658157375a..dba273190e 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 658157375a2b1b574766c1a055dde224c269a2f8 +Subproject commit dba273190e546c2a6345c38435e91780797c734f From cc856eca85eb6fb586537583b215b790d34cdb7d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 18:35:06 +0200 Subject: [PATCH 004/412] Install missing openssl packages in the Github Actions workflow --- .github/workflows/testing.yml | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 79b2ba05d0..ad7bddfabc 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -34,11 +34,11 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev - - name: Install macOs postgres dependencies + - name: Install macOS postgres dependencies if: matrix.os == 'macos-latest' - run: brew install flex bison + run: brew install flex bison openssl - name: Set pg revision for caching id: pg_ver @@ -52,10 +52,27 @@ jobs: tmp_install/ key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + - name: Set extra env for macOS + if: matrix.os == 'macos-latest' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 
'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' + # and the real cause will be inside config.log + - name: Print configure logs in case of failure + if: failure() + continue-on-error: true + run: | + echo '' && echo '=== config.log ===' && echo '' + cat tmp_install/build/config.log + echo '' && echo '=== configure.log ===' && echo '' + cat tmp_install/build/configure.log + - name: Cache cargo deps id: cache_cargo uses: actions/cache@v2 From 93467eae1f1b6dd0bb66bc0a2869b3f6e3f6afe5 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 22 Oct 2022 02:26:28 +0300 Subject: [PATCH 005/412] Hotfix to disable grant create on public schema `GRANT CREATE ON SCHEMA public` fails if there is no schema `public`. Disable it in release for now and make a better fix later (it is needed for v15 support). --- compute_tools/src/spec.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e0c0e9404b..1e7cd51b6e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -423,11 +423,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { ); db_client.simple_query(&alter_query)?; - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed since postgres 15, where this privilege is removed by default. - let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); - info!("grant query for db {} : {}", &db.name, &grant_query); - db_client.simple_query(&grant_query)?; + // // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // // This is needed since postgres 15, where this privilege is removed by default. 
+ // let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); + // info!("grant query for db {} : {}", &db.name, &grant_query); + // db_client.simple_query(&grant_query)?; } Ok(()) From 323c4ecb4fc67d3ca63de9800fcafa00dfde9a91 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Oct 2022 16:41:50 +0200 Subject: [PATCH 006/412] Add data format backward compatibility tests (#2626) --- .../actions/run-python-test-set/action.yml | 18 ++ .github/workflows/build_and_test.yml | 23 +- poetry.lock | 52 +++- pyproject.toml | 2 + test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_compatibility.py | 267 ++++++++++++++++++ 6 files changed, 352 insertions(+), 12 deletions(-) create mode 100644 test_runner/regress/test_compatibility.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index cc6ab65b76..07cb7edbe7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -73,6 +73,13 @@ runs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Download compatibility snapshot for Postgres 14 + uses: ./.github/actions/download + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14 + path: /tmp/compatibility_snapshot_pg14 + prefix: latest + - name: Run pytest env: NEON_BIN: /tmp/neon/bin @@ -80,6 +87,8 @@ runs: BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} + COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 + ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes') shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -154,6 +163,15 @@ runs: scripts/generate_and_push_perf_report.sh fi + - name: Upload compatibility snapshot for Postgres 14 + if: github.ref_name == 'release' + uses: ./.github/actions/upload + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} + # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test + path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/ + prefix: latest + - name: Create Allure report if: always() uses: ./.github/actions/allure-report diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 14ee61c5b9..660f93b025 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -844,7 +844,7 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure environment + - name: Configure environment run: | helm repo add neondatabase https://neondatabase.github.io/helm-charts aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner @@ -853,3 +853,24 @@ jobs: run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + promote-compatibility-test-snapshot: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ deploy, deploy-proxy ] + if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' + steps: + 
- name: Promote compatibility snapshot for the release + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/latest + run: | + for build_type in debug release; do + OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst + NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst + + time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + done diff --git a/poetry.lock b/poetry.lock index 27de8508ce..dfcb16107f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" @@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -560,7 +560,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "click" @@ -593,7 +593,7 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] @@ -738,9 +738,9 @@ python-versions = ">=3.6.1,<4.0" [package.extras] colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] -requirements_deprecated_finder = ["pip-api", "pipreqs"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -815,7 +815,7 @@ python-versions = ">=2.7" [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing-libs = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -836,11 +836,12 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" pyrsistent = ">=0.14.0" +setuptools = "*" six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator 
(>0.1.0)", "webcolors"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -900,6 +901,7 @@ pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" +setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0" xmltodict = "*" @@ -1008,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0" jsonschema = ">=3.2.0,<5.0.0" openapi-schema-validator = ">=0.2.0,<0.3.0" PyYAML = ">=5.1" +setuptools = "*" [package.extras] requests = ["requests"] @@ -1340,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" @@ -1394,6 +1397,19 @@ python-versions = ">= 2.7" attrs = "*" pbr = "*" +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" @@ -1460,6 +1476,14 @@ category = "main" optional = false python-versions = ">=3.7,<4.0" +[[package]] +name = "types-toml" +version = "0.10.8" +description = "Typing stubs for toml" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "types-urllib3" version = "1.26.17" @@ -1544,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975" +content-hash = "17cdbfe90f1b06dffaf24c3e076384ec08dd4a2dce5a05e50565f7364932eb2d" [metadata.files] aiopg = [ @@ -2182,6 +2206,10 @@ sarif-om = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = 
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -2210,6 +2238,10 @@ types-s3transfer = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, ] +types-toml = [ + {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"}, + {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"}, +] types-urllib3 = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, diff --git a/pyproject.toml b/pyproject.toml index 1ee6fbe6f4..765e0b97eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,12 +28,14 @@ Werkzeug = "2.1.2" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" +toml = "^0.10.2" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" mypy = "==0.971" black = "^22.6.0" isort = "^5.10.1" +types-toml = "^0.10.8" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4b2638bb2a..38a0db7cf7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -970,7 +970,7 @@ class NeonPageserverApiException(Exception): class NeonPageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip, auth_token: Optional[str] = None): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py new file mode 100644 index 0000000000..944ff64390 --- /dev/null +++ b/test_runner/regress/test_compatibility.py @@ -0,0 +1,267 @@ +import os +import re +import shutil +import subprocess +from pathlib import Path +from typing import Any, Dict, Union + +import pytest +import toml +from fixtures.neon_fixtures import ( + NeonCli, + NeonEnvBuilder, + NeonPageserverHttpClient, + PgBin, + PortDistributor, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn +from pytest import FixtureRequest + + +def dump_differs(first: Path, second: Path, output: Path) -> bool: + """ + Runs diff(1) command on two SQL dumps and write the output to the given output file. + Returns True if the dumps differ, False otherwise. + """ + + with output.open("w") as stdout: + rv = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + "--ignore-matching-lines=^--", # Ignore changes in comments + "--ignore-blank-lines", + str(first), + str(second), + ], + stdout=stdout, + ) + + return rv.returncode != 0 + + +class PortReplacer(object): + """ + Class-helper for replacing ports in config files. 
+ """ + + def __init__(self, port_distributor: PortDistributor): + self.port_distributor = port_distributor + self.port_map: Dict[int, int] = {} + + def replace_port(self, value: Union[int, str]) -> Union[int, str]: + if isinstance(value, int): + if (known_port := self.port_map.get(value)) is not None: + return known_port + + self.port_map[value] = self.port_distributor.get_port() + return self.port_map[value] + + if isinstance(value, str): + # Use regex to find port in a string + # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" + # See https://bugs.python.org/issue27657 + ports = re.findall(r":(\d+)(?:/|$)", value) + assert len(ports) == 1, f"can't find port in {value}" + port_int = int(ports[0]) + + if (known_port := self.port_map.get(port_int)) is not None: + return value.replace(f":{port_int}", f":{known_port}") + + self.port_map[port_int] = self.port_distributor.get_port() + return value.replace(f":{port_int}", f":{self.port_map[port_int]}") + + raise TypeError(f"unsupported type {type(value)} of {value=}") + + +def test_backward_compatibility( + pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest +): + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_prepare_snapshot" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + + # Make compatibility snapshot artifacts pickupable by Allure + # by copying the snapshot directory to the curent test output directory. + repo_dir = test_output_dir / "compatibility_snapshot" / "repo" + + shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir) + + # Remove old logs to avoid confusion in test artifacts + for logfile in repo_dir.glob("**/*.log"): + logfile.unlink() + + # Remove tenants data for computes + for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): + shutil.rmtree(tenant) + + # Remove wal-redo temp directory + for tenant in (repo_dir / "tenants").glob("*"): + shutil.rmtree(tenant / "wal-redo-datadir.___temp") + + # Update paths and ports in config files + pr = PortReplacer(port_distributor) + + pageserver_toml = repo_dir / "pageserver.toml" + pageserver_config = toml.load(pageserver_toml) + new_local_path = pageserver_config["remote_storage"]["local_path"].replace( + "/test_prepare_snapshot/", + "/test_backward_compatibility/compatibility_snapshot/", + ) + + pageserver_config["remote_storage"]["local_path"] = new_local_path + pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"]) + pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"]) + pageserver_config["broker_endpoints"] = [ + pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"] + ] + + with pageserver_toml.open("w") as f: + toml.dump(pageserver_config, f) + + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["etcd_broker"]["broker_endpoints"] = [ + pr.replace_port(ep) for ep in snapshot_config["etcd_broker"]["broker_endpoints"] + ] + snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_http_addr"] + ) + snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_pg_addr"] + ) + for sk in 
snapshot_config["safekeepers"]: + sk["http_port"] = pr.replace_port(sk["http_port"]) + sk["pg_port"] = pr.replace_port(sk["pg_port"]) + + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # Ensure that snapshot doesn't contain references to the original path + rv = subprocess.run( + [ + "grep", + "--recursive", + "--binary-file=without-match", + "--files-with-matches", + "test_prepare_snapshot/repo", + str(repo_dir), + ], + capture_output=True, + text=True, + ) + assert ( + rv.returncode != 0 + ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" + + # NeonEnv stub to make NeonCli happy + config: Any = type("NeonEnvStub", (object,), {}) + config.rust_log_override = None + config.repo_dir = repo_dir + config.pg_version = "14" # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var + config.initial_tenant = snapshot_config["default_tenant_id"] + + # Check that we can start the project + cli = NeonCli(config) + try: + cli.raw_cli(["start"]) + request.addfinalizer(lambda: cli.raw_cli(["stop"])) + + result = cli.pg_start("main") + request.addfinalizer(lambda: cli.pg_stop("main")) + except Exception: + breaking_changes_allowed = ( + os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true" + ) + if breaking_changes_allowed: + pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var") + else: + raise + + connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout) + assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}" + connstr = connstr_all[0] + + # Check that the project produces the same dump as the previous version. 
+ # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) + initial_dump_differs = dump_differs( + compatibility_snapshot_dir / "dump.sql", + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + + # Check that project can be recovered from WAL + # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] + auth_token = snapshot_config["pageserver"]["auth_token"] + pageserver_http = NeonPageserverHttpClient( + port=pageserver_port, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + auth_token=auth_token, + ) + + shutil.rmtree(repo_dir / "local_fs_remote_storage") + pageserver_http.timeline_delete(tenant_id, timeline_id) + pageserver_http.timeline_create(tenant_id, timeline_id) + pg_bin.run( + ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] + ) + # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + dump_from_wal_differs = dump_differs( + test_output_dir / "dump.sql", + test_output_dir / "dump-from-wal.sql", + test_output_dir / "dump-from-wal.filediff", + ) + + # Check that we can interract with the data + pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr]) + + assert not dump_from_wal_differs, "dump from WAL differs" + assert not initial_dump_differs, "initial dump differs" + + +@pytest.mark.order(after="test_backward_compatibility") +# Note: if renaming this test, don't forget to update a reference to it in a workflow file: +# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): + # The test doesn't really test anything + # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. + # + # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. 
+ neon_env_builder.pg_version = "14" + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() + pg = env.postgres.create_start("main") + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) + pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + + snapshot_config = toml.load(test_output_dir / "repo" / "config") + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + + pageserver_http = env.pageserver.http_client() + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + env.postgres.stop_all() + for sk in env.safekeepers: + sk.stop() + env.pageserver.stop() + + shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") + # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it From 7a491f52c451172dd62729b93b35359145ee4661 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 25 Oct 2022 11:25:22 -0400 Subject: [PATCH 007/412] Add draw_timeline binary (#2688) --- Cargo.lock | 7 ++ Dockerfile | 3 +- pageserver/Cargo.toml | 1 + pageserver/src/bin/draw_timeline_dir.rs | 150 ++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 pageserver/src/bin/draw_timeline_dir.rs diff --git a/Cargo.lock b/Cargo.lock index 13774f7fe6..b39ca6e5a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2170,6 +2170,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "svg_fmt", "tar", "tempfile", "thiserror", @@ -3461,6 +3462,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +[[package]] +name = "svg_fmt" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" + [[package]] name = "symbolic-common" version = "8.8.0" diff --git a/Dockerfile b/Dockerfile index cb4e213687..b0d934d480 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ -&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \ +&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image @@ -65,6 +65,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2139e24ee2..b075b86aa1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -67,6 +67,7 @@ remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } close_fds = "0.3.2" walkdir = "2.3.2" +svg_fmt = "0.4.1" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs new file mode 100644 index 0000000000..ea1ff7f3c7 --- /dev/null +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -0,0 +1,150 @@ +//! A tool for visualizing the arrangement of layerfiles within a timeline. +//! +//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in +//! page-lsn space, where every delta layer is a rectangle and every image layer is a +//! thick line. Legend: +//! - The x axis (left to right) represents page index. +//! - The y axis represents LSN, growing upwards. +//! +//! Coordinates in both axis are compressed for better readability. +//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb) +//! +//! Example use: +//! ``` +//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE +//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ firefox out.svg +//! ``` +//! +//! This API was chosen so that we can easily work with filenames extracted from ssh, +//! or from pageserver log files. +//! +//! TODO Consider shipping this as a grafana panel plugin: +//! https://grafana.com/tutorials/build-a-panel-plugin/ +use anyhow::Result; +use pageserver::repository::Key; +use std::cmp::Ordering; +use std::io::{self, BufRead}; +use std::{ + collections::{BTreeMap, BTreeSet}, + ops::Range, +}; +use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
+fn build_coordinate_compression_map(coords: Vec) -> BTreeMap { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + map +} + +fn parse_filename(name: &str) -> (Range, Range) { + let split: Vec<&str> = name.split("__").collect(); + let keys: Vec<&str> = split[0].split('-').collect(); + let mut lsns: Vec<&str> = split[1].split('-').collect(); + if lsns.len() == 1 { + lsns.push(lsns[0]); + } + + let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); + let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + (keys, lsns) +} + +fn main() -> Result<()> { + // Parse layer filenames from stdin + let mut ranges: Vec<(Range, Range)> = vec![]; + let stdin = io::stdin(); + for line in stdin.lock().lines() { + let range = parse_filename(&line.unwrap()); + ranges.push(range); + } + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for (keyr, lsnr) in &ranges { + keys.push(keyr.start); + keys.push(keyr.end); + lsns.push(lsnr.start); + lsns.push(lsnr.end); + } + + // Analyze + let key_map = build_coordinate_compression_map(keys); + let lsn_map = build_coordinate_compression_map(lsns); + + // Initialize stats + let mut num_deltas = 0; + let mut num_images = 0; + + // Draw + let stretch = 3.0; // Stretch out vertically for better visibility + println!( + "{}", + BeginSvg { + w: key_map.len() as f32, + h: stretch * lsn_map.len() as f32 + } + ); + for (keyr, lsnr) in &ranges { + let key_start = *key_map.get(&keyr.start).unwrap(); + let key_end = *key_map.get(&keyr.end).unwrap(); + let key_diff = key_end - key_start; + let lsn_max = lsn_map.len(); + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); + let lsn_end = *lsn_map.get(&lsnr.end).unwrap(); + + let mut lsn_diff = (lsn_end - lsn_start) as f32; + let mut fill = Fill::None; + let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let mut lsn_offset = 0.0; + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ match lsn_start.cmp(&lsn_end) { + Ordering::Less => num_deltas += 1, + Ordering::Equal => { + num_images += 1; + lsn_diff = 0.3; + lsn_offset = -lsn_diff / 2.0; + margin = 0.05; + fill = Fill::Color(rgb(0, 0, 0)); + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + + println!( + " {}", + rectangle( + key_start as f32 + stretch * margin, + stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)), + key_diff as f32 - stretch * 2.0 * margin, + stretch * (lsn_diff - 2.0 * margin) + ) + .fill(fill) + .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) + .border_radius(0.4) + ); + } + println!("{}", EndSvg); + + eprintln!("num_images: {}", num_images); + eprintln!("num_deltas: {}", num_deltas); + + Ok(()) +} From 70c3d18bb0c72992de1438c0f77a4e2b9c72fcab Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Oct 2022 02:51:23 +0300 Subject: [PATCH 008/412] Do not release to new staging proxies on release (#2685) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 660f93b025..1b8b380179 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -832,7 +832,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' defaults: run: From ede70d833cc547c00dd6496eee4483a98e39a04a Mon Sep 17 00:00:00 2001 From: mikecaat <35882227+mikecaat@users.noreply.github.com> Date: Wed, 26 Oct 2022 19:59:25 +0900 Subject: [PATCH 009/412] Add a docker-compose example file (#1943) (#2666) Co-authored-by: Masahiro Ikeda --- docker-compose/compute/shell/compute.sh | 48 +++++ .../compute/var/db/postgres/specs/spec.json | 141 ++++++++++++ docker-compose/docker-compose.yml | 200 ++++++++++++++++++ docker-compose/image/compute/Dockerfile | 10 + docs/docker.md | 64 ++++++ scripts/docker-compose_test.sh | 51 +++++ 6 files changed, 514 insertions(+) create mode 100755 docker-compose/compute/shell/compute.sh create mode 100644 docker-compose/compute/var/db/postgres/specs/spec.json create mode 100644 docker-compose/docker-compose.yml create mode 100644 docker-compose/image/compute/Dockerfile create mode 100755 scripts/docker-compose_test.sh diff --git a/docker-compose/compute/shell/compute.sh b/docker-compose/compute/shell/compute.sh new file mode 100755 index 0000000000..cef2b485f3 --- /dev/null +++ b/docker-compose/compute/shell/compute.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -eux + +PG_VERSION=${PG_VERSION:-14} + +SPEC_FILE_ORG=/var/db/postgres/specs/spec.json +SPEC_FILE=/tmp/spec.json + +echo "Waiting pageserver become ready." +while ! nc -z pageserver 6400; do + sleep 1; +done +echo "Page server is ready." + +echo "Create a tenant and timeline" +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{}" + http://pageserver:9898/v1/tenant/ +) +tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') + +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" +) +result=$(curl "${PARAMS[@]}") +echo $result | jq . 
+ +echo "Overwrite tenant id and timeline id in spec file" +tenant_id=$(echo ${result} | jq -r .tenant_id) +timeline_id=$(echo ${result} | jq -r .timeline_id) + +sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} + +cat ${SPEC_FILE} + +echo "Start compute node" +/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ + -C "postgresql://cloud_admin@localhost:55433/postgres" \ + -b /usr/local/bin/postgres \ + -S ${SPEC_FILE} diff --git a/docker-compose/compute/var/db/postgres/specs/spec.json b/docker-compose/compute/var/db/postgres/specs/spec.json new file mode 100644 index 0000000000..10ae0b0ecf --- /dev/null +++ b/docker-compose/compute/var/db/postgres/specs/spec.json @@ -0,0 +1,141 @@ +{ + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] +} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml new file mode 100644 index 0000000000..9ab775c3f9 --- /dev/null +++ b/docker-compose/docker-compose.yml @@ -0,0 +1,200 @@ +version: '3' + +services: + etcd: + image: quay.io/coreos/etcd:v3.5.4 + ports: + - 2379:2379 + - 2380:2380 + environment: + # This signifficantly speeds up etcd and we anyway don't data persistency there. 
+ ETCD_UNSAFE_NO_FSYNC: "1" + command: + - "etcd" + - "--auto-compaction-mode=revision" + - "--auto-compaction-retention=1" + - "--name=etcd-cluster" + - "--initial-cluster-state=new" + - "--initial-cluster-token=etcd-cluster-1" + - "--initial-cluster=etcd-cluster=http://etcd:2380" + - "--initial-advertise-peer-urls=http://etcd:2380" + - "--advertise-client-urls=http://etcd:2379" + - "--listen-client-urls=http://0.0.0.0:2379" + - "--listen-peer-urls=http://0.0.0.0:2380" + - "--quota-backend-bytes=134217728" # 128 MB + + minio: + image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + command: server /data --address :9000 --console-address ":9001" + + minio_create_buckets: + image: minio/mc + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + entrypoint: + - "/bin/sh" + - "-c" + command: + - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do + echo 'Waiting to start minio...' && sleep 1; + done; + /usr/bin/mc mb minio/neon --region=eu-north-1; + exit 0;" + depends_on: + - minio + + pageserver: + image: neondatabase/neon:${TAG:-latest} + environment: + - BROKER_ENDPOINT='http://etcd:2379' + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 6400:6400 # pg protocol handler + - 9898:9898 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "/usr/local/bin/pageserver -D /data/.neon/ + -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"listen_pg_addr='0.0.0.0:6400'\" + -c \"listen_http_addr='0.0.0.0:9898'\" + -c \"remote_storage={endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/pageserver/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper1: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 + - SAFEKEEPER_ID=1 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7676:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper2: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 + - SAFEKEEPER_ID=2 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7677:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper3: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 + - SAFEKEEPER_ID=3 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - 
AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7678:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + compute: + build: + context: ./image/compute + args: + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest} + - http_proxy=$http_proxy + - https_proxy=$https_proxy + environment: + - PG_VERSION=${PG_VERSION:-14} + #- RUST_BACKTRACE=1 + volumes: + - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute/shell/:/shell/ + ports: + - 55433:55433 # pg protocol handler + - 3080:3080 # http endpoints + entrypoint: + - "/shell/compute.sh" + depends_on: + - safekeeper1 + - safekeeper2 + - safekeeper3 + - pageserver + + compute_is_ready: + image: postgres:latest + entrypoint: + - "/bin/bash" + - "-c" + command: + - "until pg_isready -h compute -p 55433 ; do + echo 'Waiting to start compute...' && sleep 1; + done" + depends_on: + - compute diff --git a/docker-compose/image/compute/Dockerfile b/docker-compose/image/compute/Dockerfile new file mode 100644 index 0000000000..1b9d8c4900 --- /dev/null +++ b/docker-compose/image/compute/Dockerfile @@ -0,0 +1,10 @@ +ARG COMPUTE_IMAGE=compute-node-v14:latest +FROM neondatabase/${COMPUTE_IMAGE} + +USER root +RUN apt-get update && \ + apt-get install -y curl \ + jq \ + netcat + +USER postgres diff --git a/docs/docker.md b/docs/docker.md index 100cdd248b..42f0048e6f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall 1. `neondatabase/compute-tools` and `neondatabase/compute-node` 2. `neondatabase/neon` + +## Docker Compose example + +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. + +- etcd x 1 +- pageserver x 1 +- safekeeper x 3 +- compute x 1 +- MinIO x 1 # This is Amazon S3 compatible object storage + +### How to use + +1. create containers + +You can specify version of neon cluster using following environment values. +- PG_VERSION: postgres version for compute (default is 14) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +``` +$ cd docker-compose/docker-compose.yml +$ docker-compose down # remove the conainers if exists +$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version +Creating network "dockercompose_default" with the default driver +Creating dockercompose_etcd3_1 ... +(...omit...) +``` + +2. connect compute node +``` +$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass +$ psql -h localhost -p 55433 -U cloud_admin +postgres=# CREATE TABLE t(key int primary key, value text); +CREATE TABLE +postgres=# insert into t values(1,1); +INSERT 0 1 +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) +``` + +3. If you want to see the log, you can use `docker-compose logs` command. 
+``` +# check the container name you want to see +$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +(...omit...) + +$ docker logs -f dockercompose_compute_1 +2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql +2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' +(...omit...) +``` + +4. If you want to see durable data in MinIO which is s3 compatible storage + +Access http://localhost:9001 and sign in. + +- Username: `minio` +- Password: `password` + +You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file diff --git a/scripts/docker-compose_test.sh b/scripts/docker-compose_test.sh new file mode 100755 index 0000000000..b4551365f8 --- /dev/null +++ b/scripts/docker-compose_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# this is a shortcut script to avoid duplication in CI +set -eux -o pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml + +COMPUTE_CONTAINER_NAME=dockercompose_compute_1 +SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" + +cleanup() { + echo "show container information" + docker ps + docker-compose -f $COMPOSE_FILE logs + echo "stop containers..." + docker-compose -f $COMPOSE_FILE down +} + +echo "clean up containers if exists" +cleanup + +for pg_version in 14 15; do + echo "start containers (pg_version=$pg_version)." + PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d + + echo "wait until the compute is ready. timeout after 60s. " + cnt=0 + while sleep 1; do + # check timeout + cnt=`expr $cnt + 1` + if [ $cnt -gt 60 ]; then + echo "timeout before the compute is ready." + cleanup + exit 1 + fi + + # check if the compute is ready + set +o pipefail + result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` + set -o pipefail + if [ $result -eq 1 ]; then + echo "OK. The compute is ready to connect." + echo "execute simple queries." 
+ docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" + cleanup + break + fi + done +done From b4c55f5d2445452329ee09527da1c3080503e23a Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 26 Oct 2022 17:32:31 -0400 Subject: [PATCH 010/412] Move pagestream api to libs/pageserver_api (#2698) --- Cargo.lock | 3 + libs/pageserver_api/Cargo.toml | 3 + libs/pageserver_api/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 161 +++++++++++++++++ .../pageserver_api}/src/reltag.rs | 0 pageserver/src/basebackup.rs | 2 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 166 +----------------- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 2 +- 13 files changed, 181 insertions(+), 166 deletions(-) rename {pageserver => libs/pageserver_api}/src/reltag.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index b39ca6e5a7..3e67126add 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2189,7 +2189,10 @@ dependencies = [ name = "pageserver_api" version = "0.1.0" dependencies = [ + "anyhow", + "bytes", "const_format", + "postgres_ffi", "serde", "serde_with", "utils", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 5995325a2f..9121cd4989 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -7,6 +7,9 @@ edition = "2021" serde = { version = "1.0", features = ["derive"] } serde_with = "2.0" const_format = "0.2.21" +anyhow = { version = "1.0", features = ["backtrace"] } +bytes = "1.0.1" utils = { path = "../utils" } +postgres_ffi = { path = "../postgres_ffi" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index a36c1692a9..4890d54f36 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -2,6 +2,7 @@ use const_format::formatcp; /// Public API types pub mod models; +pub mod reltag; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd40ba9e1c..4360f76fd1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,6 +7,10 @@ use utils::{ lsn::Lsn, }; +use crate::reltag::RelTag; +use anyhow::bail; +use bytes::{Buf, BufMut, Bytes, BytesMut}; + /// A state of a tenant in pageserver's memory. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { @@ -219,3 +223,160 @@ pub struct FailpointConfig { pub struct TimelineGcRequest { pub gc_horizon: Option, } + +// Wrapped in libpq CopyData +pub enum PagestreamFeMessage { + Exists(PagestreamExistsRequest), + Nblocks(PagestreamNblocksRequest), + GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), +} + +// Wrapped in libpq CopyData +pub enum PagestreamBeMessage { + Exists(PagestreamExistsResponse), + Nblocks(PagestreamNblocksResponse), + GetPage(PagestreamGetPageResponse), + Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), +} + +#[derive(Debug)] +pub struct PagestreamExistsRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamNblocksRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamGetPageRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, + pub blkno: u32, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeRequest { + pub latest: bool, + pub lsn: Lsn, + pub dbnode: u32, +} + +#[derive(Debug)] +pub struct PagestreamExistsResponse { + pub exists: bool, +} + +#[derive(Debug)] +pub struct PagestreamNblocksResponse { + pub n_blocks: u32, +} + +#[derive(Debug)] +pub struct PagestreamGetPageResponse { + pub page: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamErrorResponse { + pub message: String, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeResponse { + pub db_size: i64, +} + +impl PagestreamFeMessage { + pub fn parse(mut body: Bytes) -> anyhow::Result { + // TODO these gets can fail + + // these correspond to the NeonMessageTag enum in pagestore_client.h + // + // TODO: consider using protobuf or serde bincode for less error prone + // serialization. 
+ let msg_tag = body.get_u8(); + match msg_tag { + 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + blkno: body.get_u32(), + })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + dbnode: body.get_u32(), + })), + _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), + } + } +} + +impl PagestreamBeMessage { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(resp) => { + bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put(&resp.page[..]); + } + + Self::Error(resp) => { + bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } + } + + bytes.into() + } +} diff --git a/pageserver/src/reltag.rs b/libs/pageserver_api/src/reltag.rs similarity index 100% rename from pageserver/src/reltag.rs rename to libs/pageserver_api/src/reltag.rs diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d0a57a473b..973c3cd3a6 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ee3dc684e3..642e41765b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -12,10 +12,10 @@ use tracing::*; use walkdir::WalkDir; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c75f940386..52a4cb0381 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -8,7 +8,6 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; -pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; diff --git 
a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d61885314e..aec91bc7f1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,8 +10,14 @@ // use anyhow::{bail, ensure, Context, Result}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, + PagestreamNblocksRequest, PagestreamNblocksResponse, +}; use std::io; use std::net::TcpListener; use std::str; @@ -35,7 +41,6 @@ use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; -use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -45,163 +50,6 @@ use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// Wrapped in libpq CopyData -enum PagestreamFeMessage { - Exists(PagestreamExistsRequest), - Nblocks(PagestreamNblocksRequest), - GetPage(PagestreamGetPageRequest), - DbSize(PagestreamDbSizeRequest), -} - -// Wrapped in libpq CopyData -enum PagestreamBeMessage { - Exists(PagestreamExistsResponse), - Nblocks(PagestreamNblocksResponse), - GetPage(PagestreamGetPageResponse), - Error(PagestreamErrorResponse), - DbSize(PagestreamDbSizeResponse), -} - -#[derive(Debug)] -struct PagestreamExistsRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamNblocksRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamGetPageRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, - blkno: u32, -} - -#[derive(Debug)] -struct PagestreamDbSizeRequest { - latest: bool, - lsn: Lsn, - dbnode: u32, -} - -#[derive(Debug)] -struct PagestreamExistsResponse { - exists: bool, -} - -#[derive(Debug)] -struct PagestreamNblocksResponse { - n_blocks: u32, -} - -#[derive(Debug)] -struct PagestreamGetPageResponse { - page: Bytes, -} - -#[derive(Debug)] -struct PagestreamErrorResponse { - message: String, -} - -#[derive(Debug)] -struct PagestreamDbSizeResponse { - db_size: i64, -} - -impl PagestreamFeMessage { - fn parse(mut body: Bytes) -> anyhow::Result { - // TODO these gets can fail - - // these correspond to the NeonMessageTag enum in pagestore_client.h - // - // TODO: consider using protobuf or serde bincode for less error prone - // serialization. 
- let msg_tag = body.get_u8(); - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - blkno: body.get_u32(), - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - dbnode: body.get_u32(), - })), - _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), - } - } -} - -impl PagestreamBeMessage { - fn serialize(&self) -> Bytes { - let mut bytes = BytesMut::new(); - - match self { - Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ - bytes.put(&resp.page[..]); - } - - Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ - bytes.put_i64(resp.db_size); - } - } - - bytes.into() - } -} - fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { async_stream::try_stream! { loop { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ca931ed37d..0e334a63df 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,12 +7,12 @@ //! Clarify that) //! 
use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 194ca0d857..6a96254df4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,8 +37,8 @@ use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; +use pageserver_api::reltag::RelTag; use postgres_ffi::to_pg_timestamp; use utils::{ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9a6b99d991..8c81ed824b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,10 +31,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e683c301d8..1cde11082e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,10 +43,10 @@ use crate::metrics::{ WAL_REDO_WAIT_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ From ab0be7b8da124839dbaf5fc85978d292e6ef427c Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 27 Oct 2022 03:50:46 +0300 Subject: [PATCH 011/412] Avoid debian-testing packages in compute Dockerfiles plv8 can only be built with a fairly new gold linker version. We used to install it via binutils packages from testing, but it also updates libc and that causes troubles in the resulting image as different extensions were built against different libc versions. We could either use libc from debian-testing everywhere or restrain from using testing packages and install necessary programs manually. This patch uses the latter approach: gold for plv8 and cmake for h3 are installed manually. In a passing declare h3_postgis as a safe extension (previous omission). 
--- Dockerfile.compute-node-v14 | 87 ++++++++++++++++++++++--------------- Dockerfile.compute-node-v15 | 74 ++++++++++++++++++------------- 2 files changed, 95 insertions(+), 66 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 6d2b285fa3..035dfc0d08 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -1,24 +1,26 @@ -ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.0 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v14 +# +# This file is identical to the Dockerfile.compute-node-v15 file +# except for the version of Postgres that is built. +# +ARG TAG=pinned + +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. # +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v14 postgres RUN cd postgres && \ @@ -29,22 +31,20 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. 
+######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ - tar xvzf postgis-3.3.0.tar.gz && \ - cd postgis-3.3.0 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ @@ -57,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -77,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -110,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -128,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -155,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -175,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -189,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index b7b1f25103..0b6e570b44 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -4,26 +4,23 @@ # ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.1 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v15 +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. 
# +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v15 postgres RUN cd postgres && \ @@ -34,14 +31,12 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. +######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -62,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -82,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -115,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -133,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -160,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -180,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -194,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres From 8e5bb3ed4989bbf86ff8df16b145e27f28e8e9e7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 27 Oct 2022 11:09:09 +0400 Subject: [PATCH 012/412] Enable etcd compaction in neon_local. --- control_plane/src/etcd.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index ccadfa8ce7..ca2df8a50b 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -52,6 +52,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { // size smaller. Our test etcd clusters are very small. // See https://github.com/etcd-io/etcd/issues/7910 "--quota-backend-bytes=100000000".to_string(), + // etcd doesn't compact (vacuum) with default settings, + // enable it to prevent space exhaustion. + "--auto-compaction-mode=revision".to_string(), + "--auto-compaction-retention=1".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) From 0816168296883b086dbc50af0b4ebc9fbdf3afe7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 15 Dec 2022 12:36:57 +0400 Subject: [PATCH 013/412] Hotfix: terminate subscription if channel is full. Might help as a hotfix, but need to understand root better. 
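The broker fans messages out to each subscriber over a broadcast channel; when a subscriber falls behind, `recv` reports how many messages were dropped, and this patch turns that condition from a repeated warning into a terminated subscription so the client reconnects on a fresh channel. Below is a minimal sketch of that mechanism, assuming tokio's `broadcast` channel (whose `Lagged`/`Closed` variants match the ones handled in the diff); the broker's real subscriber bookkeeping, gRPC status plumbing, and channel capacity are elided, and all names here are illustrative.

```
// Sketch only: message type and capacity are illustrative, not the broker's.
use tokio::sync::broadcast::{self, error::RecvError};

#[tokio::main]
async fn main() {
    // Small capacity so the subscriber overflows quickly.
    let (tx, mut rx) = broadcast::channel::<u64>(4);

    // The publisher outpaces the subscriber: with capacity 4, only the last
    // four values stay in the channel and the rest are counted as skipped.
    for i in 0..16u64 {
        let _ = tx.send(i);
    }

    loop {
        match rx.recv().await {
            Ok(msg) => println!("delivered {msg}"),
            // Old behaviour: log a warning, reset the counter, keep the receiver.
            // Hotfix behaviour: treat the overflow as fatal for this subscriber
            // and end the stream, so the client re-subscribes on a fresh channel.
            Err(RecvError::Lagged(skipped)) => {
                eprintln!("dropped {skipped} messages, channel is full; terminating subscription");
                break;
            }
            // The publisher side was dropped; nothing more will arrive.
            Err(RecvError::Closed) => break,
        }
    }
}
```

Here the first `recv` immediately reports the twelve skipped values and the loop exits, which mirrors the `Internal` ("full channel") status the patched broker returns to a lagging subscriber.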
--- storage_broker/src/bin/storage_broker.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index fdf2637b4d..1a743394ad 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -369,8 +369,9 @@ impl BrokerService for Broker { Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { - warn!("dropped {} messages, channel is full", missed_msgs); - missed_msgs = 0; + error!("subscription id={}, key={:?}, addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + Err(Status::new(Code::Internal, "full channel"))?; } } Err(RecvError::Closed) => { From d24de169a7ac1de84f08a420295e309a5946f893 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Dec 2022 00:57:59 +0400 Subject: [PATCH 014/412] Deploy broker with L4 LB in new env. Seems to be fixing issue with missing keepalives. --- .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +- .github/ansible/prod.eu-central-1.hosts.yaml | 2 +- .github/ansible/prod.us-east-2.hosts.yaml | 2 +- .github/ansible/prod.us-west-2.hosts.yaml | 2 +- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 33 ++++++++----------- ...ev-us-east-2-beta.neon-storage-broker.yaml | 33 ++++++++----------- ...utheast-1-epsilon.neon-storage-broker.yaml | 33 ++++++++----------- ...u-central-1-gamma.neon-storage-broker.yaml | 33 ++++++++----------- ...d-us-east-2-delta.neon-storage-broker.yaml | 33 ++++++++----------- ...rod-us-west-2-eta.neon-storage-broker.yaml | 33 ++++++++----------- .github/workflows/build_and_test.yml | 4 +-- 13 files changed, 92 insertions(+), 122 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index bcc7bb3b16..648029c120 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 2b372d0fcb..c285a9f3b6 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7a4002ec88..1753068b8c 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 
console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 682ee5994d..7d6e49bf9c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 90f00175b0..cfcc3a9ae8 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index d2b7fae12a..78a4582e57 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index e876367a18..c6e682f571 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.zeta.eu-west-1.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + 
# assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index dcf4b99de2..c7682d24c0 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.beta.us-east-2.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.beta.us-east-2.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index 0abc6ebaa1..92b1777d0b 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: 
storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index d44a3eab5c..f89df4533a 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.gamma.eu-central-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index b9eeff5681..8cbc1af7cf 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.delta.us-east-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.delta.us-east-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech + # service.type -- Service type + type: 
LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index 249f76303a..8a7488948d 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.eta.us-west-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.eta.us-west-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7a887cbece..43b855a2b0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1072,7 +1072,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1149,7 +1149,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ 
secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] From 73ea0a0b0188cfa40d09fb458e91f3cb38ce7425 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 16 Dec 2022 13:40:01 +0200 Subject: [PATCH 015/412] fix(remote_storage): use cached credentials (#3128) IMDSv2 has limits, and if we query it on every s3 interaction we are going to go over those limits. Changes the s3_bucket client configuration to use: - ChainCredentialsProvider to handle env variables or imds usage - LazyCachingCredentialsProvider to actually cache any credentials Related: https://github.com/awslabs/aws-sdk-rust/issues/629 Possibly related: https://github.com/neondatabase/neon/issues/3118 --- libs/remote_storage/src/s3_bucket.rs | 47 +++++++++++----------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ab1e5da6c5..740f3753d8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,14 +4,13 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::env::var; use std::sync::Arc; -use std::time::Duration; use anyhow::Context; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, imds, - imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn, + environment::credentials::EnvironmentVariableCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, }; use aws_sdk_s3::{ config::Config, @@ -20,7 +19,6 @@ use aws_sdk_s3::{ Client, Endpoint, Region, }; use aws_smithy_http::body::SdkBody; -use aws_types::credentials::{CredentialsError, ProvideCredentials}; use hyper::Body; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; @@ -31,8 +29,6 @@ use crate::{ Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10); - pub(super) mod metrics { use metrics::{register_int_counter_vec, IntCounterVec}; use once_cell::sync::Lazy; @@ -122,30 +118,23 @@ impl S3Bucket { "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); + + let credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + let env_creds = EnvironmentVariableCredentialsProvider::new(); + // uses imds v2 + let imds = ImdsCredentialsProvider::builder().build(); + + // finally add caching. 
+ // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 + LazyCachingCredentialsProvider::builder() + .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) + .build() + }; + let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) - .credentials_provider(provide_credentials_fn(|| async { - match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() { - true => { - EnvironmentVariableCredentialsProvider::new() - .provide_credentials() - .await - } - false => { - let imds_client = imds::Client::builder() - .connect_timeout(DEFAULT_IMDS_TIMEOUT) - .read_timeout(DEFAULT_IMDS_TIMEOUT) - .build() - .await - .map_err(CredentialsError::unhandled)?; - ImdsCredentialsProvider::builder() - .imds_client(imds_client) - .build() - .provide_credentials() - .await - } - } - })); + .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { let endpoint = Endpoint::immutable( From ece05556002b9c95fb9ad8f0a475ab10468e77fd Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 14 Dec 2022 21:28:14 +0100 Subject: [PATCH 016/412] Push proxy metrics to Victoria Metrics (#3106) --- .../dev-eu-west-1-zeta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-link.yaml | 25 +++++++++++++++++++ ...s-east-2-beta.neon-proxy-scram-legacy.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/neon-stress.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/neon-stress.proxy.yaml | 25 +++++++++++++++++++ ...-southeast-1-epsilon.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...d-eu-central-1-gamma.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...prod-us-east-2-delta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../prod-us-west-2-eta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/production.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/production.proxy.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy.yaml | 25 +++++++++++++++++++ 14 files changed, 350 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index f89eea5972..ae9c1f2e40 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eeb025277b..093fac146a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -38,3 +38,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index ed710bc196..a2f932e4fb 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index ba0109c1eb..1138536e94 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml index dea47304a0..ed580349fc 100644 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ b/.github/helm-values/neon-stress.proxy-scram.yaml @@ -25,3 +25,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index c3ecf6c743..94270ced09 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -34,3 +34,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a37a37406c..4e4aff1f9e 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 69d00a7e9c..94290a87e1 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 19d91fa4dc..1a4023708b 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index f148188c48..2942d6a2aa 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 399bc6d21b..c7143cd61a 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -23,3 +23,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 9db68c1044..dbaf3cd096 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -32,3 +32,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml index f249df3612..66f9921c9a 100644 --- a/.github/helm-values/staging.proxy-scram.yaml +++ b/.github/helm-values/staging.proxy-scram.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 62b4c4a595..a22082e625 100644 --- a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" From f759b561f3f7489b536c8ed4638e34ea5c73c91a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 29 Dec 2022 16:08:21 +0200 Subject: [PATCH 017/412] add pageserver to new region see https://github.com/neondatabase/aws/pull/116 --- .github/ansible/prod.us-west-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 7d6e49bf9c..9eb422a3ae 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -25,6 +25,8 @@ storage: ansible_host: i-0d9f6dfae0e1c780d pageserver-1.us-west-2.aws.neon.tech: ansible_host: i-0c834be1dddba8b3f + pageserver-2.us-west-2.aws.neon.tech: + ansible_host: i-051642d372c0a4f32 safekeepers: hosts: From 06d25f2186cc4b525dc138a8d193da92001f0e96 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 28 Dec 2022 17:48:49 +0200 Subject: [PATCH 018/412] switch to debug from info to produce less noise --- pageserver/src/storage_sync2.rs | 6 +++--- pageserver/src/tenant/timeline.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 7cc0eac2bf..55dbeaff73 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -203,7 +203,7 @@ use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::runtime::Runtime; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -747,7 +747,7 @@ impl RemoteTimelineClient { // We can launch this task. Remove it from the queue first. let next_op = upload_queue.queued_operations.pop_front().unwrap(); - info!("starting op: {}", next_op); + debug!("starting op: {}", next_op); // Update the counters match next_op { @@ -930,7 +930,7 @@ impl RemoteTimelineClient { task.op, retries ); } else { - info!("remote task {} completed successfully", task.op); + debug!("remote task {} completed successfully", task.op); } // The task has completed succesfully. Remove it from the in-progress list. 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a746fd9bf8..cd045d1081 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2288,7 +2288,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await @@ -2499,7 +2499,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await From cd01bbc715e9ac594af29f6a657430e7a0703877 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 22 Dec 2022 19:21:53 +0400 Subject: [PATCH 019/412] Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index d22c845966..3122a43801 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -34,5 +34,5 @@ storage: console_region_id: aws-us-west-2 zenith-1-sk-2: console_region_id: aws-us-west-2 - zenith-1-sk-3: + zenith-1-sk-4: console_region_id: aws-us-west-2 From d90c5a03af3763949442686e118581e5cdd4dd90 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 17 Jan 2023 22:07:38 +0200 Subject: [PATCH 020/412] Add more io::Error context when fail to operate on a path (#3254) I have a test failure that shows ``` Caused by: 0: Failed to reconstruct a page image: 1: Directory not empty (os error 39) ``` but does not really show where exactly that happens. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d The PR aims to add more context in debugging that issue. --- pageserver/src/walredo.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a552c05d63..fd0524016f 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -626,24 +626,20 @@ impl PostgresRedoProcess { // Create empty data directory for wal-redo postgres, deleting old one first. 
if datadir.exists() { - info!( - "old temporary datadir {} exists, removing", - datadir.display() - ); - fs::remove_dir_all(&datadir)?; + info!("old temporary datadir {datadir:?} exists, removing"); + fs::remove_dir_all(&datadir).map_err(|e| { + Error::new( + e.kind(), + format!("Old temporary dir {datadir:?} removal failure: {e}"), + ) + })?; } - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| { - Error::new( - ErrorKind::Other, - format!("incorrect pg_bin_dir path: {}", e), - ) - })?; - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| { - Error::new( - ErrorKind::Other, - format!("incorrect pg_lib_dir path: {}", e), - ) - })?; + let pg_bin_dir_path = conf + .pg_bin_dir(pg_version) + .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; + let pg_lib_dir_path = conf + .pg_lib_dir(pg_version) + .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; info!("running initdb in {}", datadir.display()); let initdb = Command::new(pg_bin_dir_path.join("initdb")) From bd535b3371f4e45f13a3f9abbacc3efbf931616f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jan 2023 02:29:05 +0200 Subject: [PATCH 021/412] If an error happens while checking for core dumps, don't panic. If we panic, we skip the 30s wait in 'main', and don't give the console a chance to observe the error. Which is not nice. Spotted by @ololobus at https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981 --- compute_tools/src/compute.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c2c9ab2230..d652084e00 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,7 +23,7 @@ use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::{info, warn}; +use log::{error, info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; @@ -311,8 +311,9 @@ impl ComputeNode { .wait() .expect("failed to start waiting on Postgres process"); - self.check_for_core_dumps() - .expect("failed to check for core dumps"); + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } Ok(ecode) } From 4992160677509996064df4f3dacc79c64fe2c9c2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 Jan 2023 14:58:55 +0200 Subject: [PATCH 022/412] Fix metric_collection_endpoint for prod. 
It was incorrectly set to staging url --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index 22bace5ade..ecb847bd61 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -7,7 +7,7 @@ storage: broker_endpoint: http://storage-broker.prod.local:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" From c85374295fd38e081c1a683281016355471852bd Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 18 Jan 2023 14:49:59 +0100 Subject: [PATCH 023/412] Change SENTRY_ENVIRONMENT from "development" to "staging" --- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 2 +- .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index fce450ed39..f28dc8e07b 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -18,7 +18,7 @@ storage: ansible_aws_ssm_region: eu-west-1 ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1 console_region_id: aws-eu-west-1 - sentry_environment: development + sentry_environment: staging children: pageservers: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 1d1b8dbfa4..4891875369 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -18,7 +18,7 @@ storage: ansible_aws_ssm_region: us-east-2 ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2 console_region_id: aws-us-east-2 - sentry_environment: development + sentry_environment: staging children: pageservers: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index 47924456ba..c49b8d2009 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.eu-west-1.aws.neon.build" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index c6e682f571..ccf701f52d 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -49,4 +49,4 @@ extraManifests: - "{{ .Release.Namespace }}" settings: - 
sentryEnvironment: "development" + sentryEnvironment: "staging" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eb8fd50c0f..cb062f705d 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,7 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" - sentryEnvironment: "development" + sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 8a08738d5f..99b67d75c1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index b02d46917c..764bb25b64 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.us-east-2.aws.neon.build" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index c7682d24c0..69363c5f13 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -49,4 +49,4 @@ extraManifests: - "{{ .Release.Namespace }}" settings: - sentryEnvironment: "development" + sentryEnvironment: "staging" From cb356f325978241e20a82703b04b784cdf8bb605 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 7 Jan 2023 00:23:35 +0200 Subject: [PATCH 024/412] Use actual temporary dir for pageserver unit tests --- .gitignore | 2 - control_plane/.gitignore | 1 - pageserver/src/config.rs | 5 - pageserver/src/tenant.rs | 105 ++++++++---------- pageserver/src/tenant/ephemeral_file.rs | 36 +++--- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/virtual_file.rs | 8 +- pageserver/src/walingest.rs | 12 +- .../src/walreceiver/connection_manager.rs | 16 +-- test_runner/sql_regress/.gitignore | 1 - 10 files changed, 80 insertions(+), 108 deletions(-) delete mode 100644 control_plane/.gitignore diff --git a/.gitignore b/.gitignore index f1afdee599..2e241ee8cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,5 @@ /pg_install /target -/tmp_check -/tmp_check_cli __pycache__/ test_output/ .vscode diff --git a/control_plane/.gitignore b/control_plane/.gitignore deleted file mode 100644 index c1e54a6bcb..0000000000 --- 
a/control_plane/.gitignore +++ /dev/null @@ -1 +0,0 @@ -tmp_check/ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..f3e5fb8c1a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -693,11 +693,6 @@ impl PageServerConf { Ok(t_conf) } - #[cfg(test)] - pub fn test_repo_dir(test_name: &str) -> PathBuf { - PathBuf::from(format!("../tmp_check/test_{test_name}")) - } - pub fn dummy_conf(repo_dir: PathBuf) -> Self { let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d0d6b66ab..0dd6735993 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,9 +2626,9 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; - use once_cell::sync::Lazy; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::sync::Arc; use std::{fs, path::PathBuf}; + use tempfile::TempDir; use utils::lsn::Lsn; use crate::{ @@ -2659,8 +2659,6 @@ pub mod harness { buf.freeze() } - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -2681,36 +2679,27 @@ pub mod harness { } } - pub struct TenantHarness<'a> { + /// The harness saves some boilerplate and provides a way to create functional tenant + /// without running pageserver binary. It uses temporary directory to store data in it. + /// Tempdir gets removed on harness drop. + pub struct TenantHarness { + // keep the struct to not to remove tmp dir during the test + _temp_repo_dir: TempDir, pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, - - pub lock_guard: ( - Option>, - Option>, - ), } - impl<'a> TenantHarness<'a> { - pub fn create(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; + static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; + impl TenantHarness { + pub fn new() -> anyhow::Result { + let temp_repo_dir = tempfile::tempdir()?; + // `TempDir` uses a randomly generated subdirectory of a system tmp dir, + // so far it's enough to take care of concurrently running tests. + let repo_dir = temp_repo_dir.path(); - let conf = PageServerConf::dummy_conf(repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf()); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); @@ -2728,10 +2717,10 @@ pub mod harness { fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { + _temp_repo_dir: temp_repo_dir, conf, tenant_conf, tenant_id, - lock_guard, }) } @@ -2825,7 +2814,8 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -2858,9 +2848,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2891,7 +2880,8 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2988,10 +2978,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3026,9 +3014,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? @@ -3077,9 +3064,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3101,9 +3087,8 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -3134,8 +3119,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { - const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; { let tenant = harness.load().await; let tline = tenant @@ -3154,8 +3138,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { - const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; // create two timelines { let tenant = harness.load().await; @@ -3193,8 +3176,7 @@ mod tests { #[tokio::test] async fn corrupt_metadata() -> anyhow::Result<()> { - const TEST_NAME: &str = "corrupt_metadata"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; let tenant = harness.load().await; tenant @@ -3235,7 +3217,8 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3302,7 +3285,8 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3346,7 +3330,8 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3419,9 +3404,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3505,9 +3489,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c433e65ad2..0debeaff1c 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -277,7 +277,7 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), @@ -332,25 +332,17 @@ mod tests { use super::*; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockCursor; + use crate::tenant::harness::TenantHarness; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn harness( - test_name: &str, - ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + fn harness() -> Result<(TenantHarness, TimelineId), io::Error> { + let harness = TenantHarness::new().expect("Failed to create tenant harness"); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + fs::create_dir_all(harness.timeline_path(&timeline_id))?; - Ok((conf, tenant_id, timeline_id)) + Ok((harness, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -367,10 +359,10 @@ mod tests { } #[test] - fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; + fn test_ephemeral_files() -> io::Result<()> { + let (harness, timeline_id) = harness()?; - let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -381,7 +373,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -398,10 +390,10 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; + fn test_ephemeral_blobs() -> io::Result<()> { + let (harness, timeline_id) = harness()?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..58b7eea1eb 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1064,7 +1064,7 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - let harness = TenantHarness::create("upload_scheduling")?; + let harness = TenantHarness::new()?; let timeline_path = harness.timeline_path(&TIMELINE_ID); std::fs::create_dir_all(&timeline_path)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index fb216123c1..3ad049cc21 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -525,12 +525,13 @@ mod tests { }) } - fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + fn test_files(test_name: &str, openfunc: OF) -> Result<(), Error> where FD: Read + Write + Seek + FileExt, OF: Fn(&Path, &OpenOptions) -> Result, { - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let temp_repo_dir = tempfile::tempdir()?; + let testdir = temp_repo_dir.path().join(test_name); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); @@ -632,7 +633,8 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); + let temp_repo_dir = tempfile::tempdir()?; + let testdir = temp_repo_dir.path().join("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; // Create a test file. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..77fce95160 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1146,7 +1146,8 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1323,7 +1324,8 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1376,7 +1378,8 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1497,7 +1500,8 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 8b60e59305..be58aa0e07 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -846,7 +846,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -879,7 +879,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1001,7 +1001,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1041,7 +1041,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1105,7 +1105,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1166,7 +1166,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1232,7 +1232,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + async fn dummy_state(harness: 
&TenantHarness) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/test_runner/sql_regress/.gitignore b/test_runner/sql_regress/.gitignore index 89129d7358..83186b5c86 100644 --- a/test_runner/sql_regress/.gitignore +++ b/test_runner/sql_regress/.gitignore @@ -2,7 +2,6 @@ /pg_regress # Generated subdirectories -/tmp_check/ /results/ /log/ From ffca97bc1e10b6c351067b14bdee6bbc68369c4b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 12 Jan 2023 15:14:04 +0200 Subject: [PATCH 025/412] Enable logs in unit tests --- libs/utils/src/logging.rs | 2 ++ pageserver/src/tenant.rs | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 3b1a1f5aff..82c9267f4a 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,6 +8,7 @@ use strum_macros::{EnumString, EnumVariantNames}; pub enum LogFormat { Plain, Json, + Test, } impl LogFormat { @@ -39,6 +40,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { match log_format { LogFormat::Json => base_logger.json().init(), LogFormat::Plain => base_logger.init(), + LogFormat::Test => base_logger.with_test_writer().init(), } Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0dd6735993..c53c9bc3e1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,9 +2626,11 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; + use once_cell::sync::OnceCell; use std::sync::Arc; use std::{fs, path::PathBuf}; use tempfile::TempDir; + use utils::logging; use utils::lsn::Lsn; use crate::{ @@ -2694,6 +2696,10 @@ pub mod harness { impl TenantHarness { pub fn new() -> anyhow::Result { + LOG_HANDLE.get_or_init(|| { + logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + }); + let temp_repo_dir = tempfile::tempdir()?; // `TempDir` uses a randomly generated subdirectory of a system tmp dir, // so far it's enough to take care of concurrently running tests. From 7b22b5c43321bf729caea4766c7f98b589c405ab Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jan 2023 11:30:02 +0200 Subject: [PATCH 026/412] Switch to 'tracing' for logging, restructure code to make use of spans. Refactors Compute::prepare_and_run. It's split into subroutines differently, to make it easier to attach tracing spans to the different stages. The high-level logic for waiting for Postgres to exit is moved to the caller. Replace 'env_logger' with 'tracing', and add `#instrument` directives to different stages fo the startup process. This is a fairly mechanical change, except for the changes in 'spec.rs'. 'spec.rs' contained some complicated formatting, where parts of log messages were printed directly to stdout with `print`s. That was a bit messed up because the log normally goes to stderr, but those lines were printed to stdout. In our docker images, stderr and stdout both go to the same place so you wouldn't notice, but I don't think it was intentional. This changes the log format to the default 'tracing_subscriber::format' format. It's different from the Postgres log format, however, and because both compute_tools and Postgres print to the same log, it's now a mix of two different formats. I'm not sure how the Grafana log parsing pipeline can handle that. 
If it's a problem, we can build custom formatter to change the compute_tools log format to be the same as Postgres's, like it was before this commit, or we can change the Postgres log format to match tracing_formatter's, or we can start printing compute_tool's log output to a different destination than Postgres --- Cargo.lock | 6 +- compute_tools/Cargo.toml | 4 +- compute_tools/src/bin/compute_ctl.rs | 55 +++++--- compute_tools/src/checker.rs | 4 +- compute_tools/src/compute.rs | 77 ++++++----- compute_tools/src/http/api.rs | 2 +- compute_tools/src/informant.rs | 2 +- compute_tools/src/logger.rs | 44 ++---- compute_tools/src/monitor.rs | 2 +- compute_tools/src/pg_helpers.rs | 12 +- compute_tools/src/spec.rs | 171 ++++++++++++++---------- test_runner/regress/test_compute_ctl.py | 2 +- workspace_hack/Cargo.toml | 5 +- 13 files changed, 210 insertions(+), 176 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 59adf696a7..d8aba9ba68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -833,10 +833,8 @@ dependencies = [ "anyhow", "chrono", "clap 4.0.32", - "env_logger", "futures", "hyper", - "log", "notify", "postgres", "regex", @@ -845,6 +843,8 @@ dependencies = [ "tar", "tokio", "tokio-postgres", + "tracing", + "tracing-subscriber", "url", "workspace_hack", ] @@ -1954,7 +1954,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", - "serde", ] [[package]] @@ -4565,6 +4564,7 @@ dependencies = [ "tower", "tracing", "tracing-core", + "tracing-subscriber", "url", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1e0aee81d7..4536604bdf 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,10 +8,8 @@ license.workspace = true anyhow.workspace = true chrono.workspace = true clap.workspace = true -env_logger.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } -log = { workspace = true, features = ["std", "serde"] } notify.workspace = true postgres.workspace = true regex.workspace = true @@ -20,6 +18,8 @@ serde_json.workspace = true tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 307300cfd8..e5ab8eb153 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,7 +40,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use log::{error, info}; +use tracing::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; @@ -53,7 +53,6 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - // TODO: re-use `utils::logging` later init_logger(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -122,29 +121,45 @@ fn main() -> Result<()> { // Also spawn the thread responsible for handling the VM informant -- if it's present let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); - // Run compute (Postgres) and hang waiting on it. 
- match compute.prepare_and_run() { - Ok(ec) => { - let code = ec.code().unwrap_or(1); - info!("Postgres exited with code {}, shutting down", code); - exit(code) - } - Err(error) => { - error!("could not start the compute node: {:?}", error); - + // Start Postgres + let mut delay_exit = false; + let mut exit_code = None; + let pg = match compute.start_compute() { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:?}", err); let mut state = compute.state.write().unwrap(); - state.error = Some(format!("{:?}", error)); + state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; drop(state); - - // Keep serving HTTP requests, so the cloud control plane was able to - // get the actual error. - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - info!("shutting down"); - Err(error) + delay_exit = true; + None } + }; + + // Wait for the child Postgres process forever. In this state Ctrl+C will + // propagate to Postgres and it will be shut down as well. + if let Some(mut pg) = pg { + let ecode = pg + .wait() + .expect("failed to start waiting on Postgres process"); + info!("Postgres exited with code {}, shutting down", ecode); + exit_code = ecode.code() } + + if let Err(err) = compute.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. + if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + thread::sleep(Duration::from_secs(30)); + info!("shutting down"); + } + + exit(exit_code.unwrap_or(1)) } fn cli() -> clap::Command { diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index ee1605c814..b8413de516 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,10 +1,11 @@ use anyhow::{anyhow, Result}; -use log::error; use postgres::Client; use tokio_postgres::NoTls; +use tracing::{error, instrument}; use crate::compute::ComputeNode; +#[instrument(skip_all)] pub fn create_writability_check_data(client: &mut Client) -> Result<()> { let query = " CREATE TABLE IF NOT EXISTS health_check ( @@ -21,6 +22,7 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> { Ok(()) } +#[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; if client.is_closed() { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d652084e00..e229bb1cc2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,15 +17,15 @@ use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; -use std::process::{Command, ExitStatus, Stdio}; +use std::process::{Command, Stdio}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::{error, info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; +use tracing::{info, instrument, warn}; use crate::checker::create_writability_check_data; use crate::config; @@ -121,6 +121,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. 
+ #[instrument(skip(self))] fn get_basebackup(&self, lsn: &str) -> Result<()> { let start_time = Utc::now(); @@ -154,6 +155,7 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. + #[instrument(skip(self))] fn sync_safekeepers(&self) -> Result { let start_time = Utc::now(); @@ -196,6 +198,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. + #[instrument(skip(self))] pub fn prepare_pgdata(&self) -> Result<()> { let spec = &self.spec; let pgdata_path = Path::new(&self.pgdata); @@ -229,9 +232,8 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. - pub fn run(&self) -> Result { - let start_time = Utc::now(); - + #[instrument(skip(self))] + pub fn start_postgres(&self) -> Result { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -242,6 +244,11 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; + Ok(pg) + } + + #[instrument(skip(self))] + pub fn apply_config(&self) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. // @@ -279,8 +286,34 @@ impl ComputeNode { // 'Close' connection drop(client); - let startup_end_time = Utc::now(); + info!( + "finished configuration of compute for project {}", + self.spec.cluster.cluster_id + ); + + Ok(()) + } + + #[instrument(skip(self))] + pub fn start_compute(&self) -> Result { + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}", + self.spec.cluster.cluster_id, + self.spec.operation_uuid.as_ref().unwrap(), + self.tenant, + self.timeline, + ); + + self.prepare_pgdata()?; + + let start_time = Utc::now(); + + let pg = self.start_postgres()?; + + self.apply_config()?; + + let startup_end_time = Utc::now(); self.metrics.config_ms.store( startup_end_time .signed_duration_since(start_time) @@ -300,35 +333,7 @@ impl ComputeNode { self.set_status(ComputeStatus::Running); - info!( - "finished configuration of compute for project {}", - self.spec.cluster.cluster_id - ); - - // Wait for child Postgres process basically forever. In this state Ctrl+C - // will propagate to Postgres and it will be shut down as well. - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - - if let Err(err) = self.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(ecode) - } - - pub fn prepare_and_run(&self) -> Result { - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - self.spec.cluster.cluster_id, - self.spec.operation_uuid.as_ref().unwrap(), - self.tenant, - self.timeline, - ); - - self.prepare_pgdata()?; - self.run() + Ok(pg) } // Look for core dumps and collect backtraces. @@ -341,7 +346,7 @@ impl ComputeNode { // // Use that as a default location and pattern, except macos where core dumps are written // to /cores/ directory by default. 
- fn check_for_core_dumps(&self) -> Result<()> { + pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), _ => Path::new(&self.pgdata), diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 44f83e5003..f2a49f332c 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -6,8 +6,8 @@ use std::thread; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use log::{error, info}; use serde_json; +use tracing::{error, info}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs index 09bd5e3138..8a6e3ab43a 100644 --- a/compute_tools/src/informant.rs +++ b/compute_tools/src/informant.rs @@ -1,8 +1,8 @@ -use log::{info, warn}; use std::path::Path; use std::process; use std::thread; use std::time::Duration; +use tracing::{info, warn}; use anyhow::{Context, Result}; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index dde0a950f8..57e5496e86 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,42 +1,20 @@ -use std::io::Write; - use anyhow::Result; -use chrono::Utc; -use env_logger::{Builder, Env}; - -macro_rules! info_println { - ($($tts:tt)*) => { - if log_enabled!(Level::Info) { - println!($($tts)*); - } - } -} - -macro_rules! info_print { - ($($tts:tt)*) => { - if log_enabled!(Level::Info) { - print!($($tts)*); - } - } -} +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::prelude::*; /// Initialize `env_logger` using either `default_level` or /// `RUST_LOG` environment variable as default log level. pub fn init_logger(default_level: &str) -> Result<()> { - let env = Env::default().filter_or("RUST_LOG", default_level); + let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); - Builder::from_env(env) - .format(|buf, record| { - let thread_handle = std::thread::current(); - writeln!( - buf, - "{} [{}] {}: {}", - Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z"), - thread_handle.name().unwrap_or("main"), - record.level(), - record.args() - ) - }) + let fmt_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_writer(std::io::stderr); + + tracing_subscriber::registry() + .with(env_filter) + .with(fmt_layer) .init(); Ok(()) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index c871422e78..7c9878ffcf 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,8 +3,8 @@ use std::{thread, time}; use anyhow::Result; use chrono::{DateTime, Utc}; -use log::{debug, info}; use postgres::{Client, NoTls}; +use tracing::{debug, info}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ff422f1cf5..921289d7c2 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -11,6 +11,7 @@ use anyhow::{bail, Result}; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; use serde::Deserialize; +use tracing::{debug, instrument}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds @@ -229,6 +230,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections. 
It's ready to /// accept connections when the state-field in `pgdata/postmaster.pid` says /// 'ready'. +#[instrument(skip(pg))] pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); @@ -287,18 +289,18 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } let res = rx.recv_timeout(Duration::from_millis(100)); - log::debug!("woken up by notify: {res:?}"); + debug!("woken up by notify: {res:?}"); // If there are multiple events in the channel already, we only need to be // check once. Swallow the extra events before we go ahead to check the // pid file. while let Ok(res) = rx.try_recv() { - log::debug!("swallowing extra event: {res:?}"); + debug!("swallowing extra event: {res:?}"); } // Check that we can open pid file first. if let Ok(file) = File::open(&pid_path) { if !postmaster_pid_seen { - log::debug!("postmaster.pid appeared"); + debug!("postmaster.pid appeared"); watcher .unwatch(pgdata) .expect("Failed to remove pgdata dir watch"); @@ -314,7 +316,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - log::debug!("last line of postmaster.pid: {status:?}"); + debug!("last line of postmaster.pid: {status:?}"); // Now Postgres is ready to accept connections if status == "ready" { @@ -330,7 +332,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } } - log::info!("PostgreSQL is now running, continuing to configure it"); + tracing::info!("PostgreSQL is now running, continuing to configure it"); Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 81e01fe555..40c8366bf4 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,12 +1,11 @@ use std::path::Path; use std::str::FromStr; -use std::time::Instant; use anyhow::Result; -use log::{info, log_enabled, warn, Level}; use postgres::config::Config; use postgres::{Client, NoTls}; use serde::Deserialize; +use tracing::{info, info_span, instrument, span_enabled, warn, Level}; use crate::compute::ComputeNode; use crate::config; @@ -80,23 +79,25 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. 
+#[instrument(skip_all)] pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let mut xact = client.transaction()?; let existing_roles: Vec = get_existing_roles(&mut xact)?; // Print a list of existing Postgres roles (only in debug mode) - info!("postgres roles:"); - for r in &existing_roles { - info_println!( - "{} - {}:{}", - " ".repeat(27 + 5), - r.name, - if r.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - ); + if span_enabled!(Level::INFO) { + info!("postgres roles:"); + for r in &existing_roles { + info!( + " - {}:{}", + r.name, + if r.encrypted_password.is_some() { + "[FILTERED]" + } else { + "(null)" + } + ); + } } // Process delta operations first @@ -137,58 +138,68 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info!("cluster spec roles:"); for role in &spec.cluster.roles { let name = &role.name; - - info_print!( - "{} - {}:{}", - " ".repeat(27 + 5), - name, - if role.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - ); - // XXX: with a limited number of roles it is fine, but consider making it a HashMap let pg_role = existing_roles.iter().find(|r| r.name == *name); - if let Some(r) = pg_role { - let mut update_role = false; - + enum RoleAction { + None, + Update, + Create, + } + let action = if let Some(r) = pg_role { if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) || (r.encrypted_password.is_some() && role.encrypted_password.is_none()) { - update_role = true; + RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { // Check whether password changed or not (trim 'md5:' prefix first) - update_role = pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap(); + if pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap() { + RoleAction::Update + } else { + RoleAction::None + } + } else { + RoleAction::None } + } else { + RoleAction::Create + }; - if update_role { + match action { + RoleAction::None => {} + RoleAction::Update => { let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); - info_print!(" -> update"); - query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; } - } else { - info!("role name: '{}'", &name); - let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); - info!("role create query: '{}'", &query); - info_print!(" -> create"); + RoleAction::Create => { + let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); + info!("role create query: '{}'", &query); + query.push_str(&role.to_pg_options()); + xact.execute(query.as_str(), &[])?; - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - - let grant_query = format!( - "GRANT pg_read_all_data, pg_write_all_data TO {}", - name.pg_quote() - ); - xact.execute(grant_query.as_str(), &[])?; - info!("role grant query: '{}'", &grant_query); + let grant_query = format!( + "GRANT pg_read_all_data, pg_write_all_data TO {}", + name.pg_quote() + ); + xact.execute(grant_query.as_str(), &[])?; + info!("role grant query: '{}'", &grant_query); + } } - info_print!("\n"); + if span_enabled!(Level::INFO) { + let pwd = if role.encrypted_password.is_some() { + "[FILTERED]" + } else { + "(null)" + }; + let action_str = match action { + RoleAction::None => "", + RoleAction::Create => " -> create", + RoleAction::Update => " -> update", + }; + info!(" - {}:{}{}", name, pwd, action_str); + } } xact.commit()?; @@ -197,6 +208,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } /// 
Reassign all dependent objects and delete requested roles. +#[instrument(skip_all)] pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { if let Some(ops) = &node.spec.delta_operations { // First, reassign all dependent objects to db owners. @@ -261,13 +273,16 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level /// atomicity should be enough here due to the order of operations and various checks, /// which together provide us idempotency. +#[instrument(skip_all)] pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let existing_dbs: Vec = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) - info!("postgres databases:"); - for r in &existing_dbs { - info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner); + if span_enabled!(Level::INFO) { + info!("postgres databases:"); + for r in &existing_dbs { + info!(" {}:{}", r.name, r.owner); + } } // Process delta operations first @@ -310,13 +325,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { for db in &spec.cluster.databases { let name = &db.name; - info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner); - // XXX: with a limited number of databases it is fine, but consider making it a HashMap let pg_db = existing_dbs.iter().find(|r| r.name == *name); - let start_time = Instant::now(); - if let Some(r) = pg_db { + enum DatabaseAction { + None, + Update, + Create, + } + let action = if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. let new_owner = if r.owner.starts_with('"') { @@ -326,29 +343,42 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { }; if new_owner != r.owner { + // Update the owner + DatabaseAction::Update + } else { + DatabaseAction::None + } + } else { + DatabaseAction::Create + }; + + match action { + DatabaseAction::None => {} + DatabaseAction::Update => { let query: String = format!( "ALTER DATABASE {} OWNER TO {}", name.pg_quote(), db.owner.pg_quote() ); - info_print!(" -> update"); - + let _ = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; - let elapsed = start_time.elapsed().as_millis(); - info_print!(" ({} ms)", elapsed); } - } else { - let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); - info_print!(" -> create"); + DatabaseAction::Create => { + let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); + query.push_str(&db.to_pg_options()); + let _ = info_span!("executing", query).entered(); + client.execute(query.as_str(), &[])?; + } + }; - query.push_str(&db.to_pg_options()); - client.execute(query.as_str(), &[])?; - - let elapsed = start_time.elapsed().as_millis(); - info_print!(" ({} ms)", elapsed); + if span_enabled!(Level::INFO) { + let action_str = match action { + DatabaseAction::None => "", + DatabaseAction::Create => " -> create", + DatabaseAction::Update => " -> update", + }; + info!(" - {}:{}{}", db.name, db.owner, action_str); } - - info_print!("\n"); } Ok(()) @@ -356,6 +386,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. 
+#[instrument(skip_all)] pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { let spec = &node.spec; diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index f973bd8e60..05ac3841dc 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -194,7 +194,7 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) except TimeoutExpired as exc: ctl_logs = (exc.stderr or b"").decode("utf-8") - log.info("compute_ctl output:\n{ctl_logs}") + log.info(f"compute_ctl stderr:\n{ctl_logs}") with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): start = "starting safekeepers syncing" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8addfcf72e..f4b71ae9b7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -26,7 +26,7 @@ futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["serde", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } @@ -45,6 +45,7 @@ tokio-util = { version = "0.7", features = ["codec", "io"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } [build-dependencies] @@ -54,7 +55,7 @@ either = { version = "1" } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["serde", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } From 300da5b872e105e404eb0842c9302f5742fb164b Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 19 Jan 2023 10:29:15 -0500 Subject: [PATCH 027/412] Improve layer map docstrings (#3382) --- pageserver/src/tenant/layer_map.rs | 52 ++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 44bed5959f..01c5359e88 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -250,15 +250,32 @@ where L: ?Sized + Layer, { /// - /// Find the latest layer that covers the given 'key', with lsn < - /// 'end_lsn'. + /// Find the latest layer (by lsn.end) that covers the given + /// 'key', with lsn.start < 'end_lsn'. /// - /// Returns the layer, if any, and an 'lsn_floor' value that - /// indicates which portion of the layer the caller should - /// check. 'lsn_floor' is normally the start-LSN of the layer, but - /// can be greater if there is an overlapping layer that might - /// contain the version, even if it's missing from the returned - /// layer. + /// The caller of this function is the page reconstruction + /// algorithm looking for the next relevant delta layer, or + /// the terminal image layer. 
The caller will pass the lsn_floor + /// value as end_lsn in the next call to search. + /// + /// If there's an image layer exactly below the given end_lsn, + /// search should return that layer regardless if there are + /// overlapping deltas. + /// + /// If the latest layer is a delta and there is an overlapping + /// image with it below, the lsn_floor returned should be right + /// above that image so we don't skip it in the search. Otherwise + /// the lsn_floor returned should be the bottom of the delta layer + /// because we should make as much progress down the lsn axis + /// as possible. It's fine if this way we skip some overlapping + /// deltas, because the delta we returned would contain the same + /// wal content. + /// + /// TODO: This API is convoluted and inefficient. If the caller + /// makes N search calls, we'll end up finding the same latest + /// image layer N times. We should either cache the latest image + /// layer result, or simplify the api to `get_latest_image` and + /// `get_latest_delta`, and only call `get_latest_image` once. /// /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! @@ -401,7 +418,9 @@ where NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given key- and LSN-range? + /// Is there a newer image layer for given key- and LSN-range? Or a set + /// of image layers within the specified lsn range that cover the entire + /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. @@ -488,8 +507,8 @@ where /// /// Divide the whole given range of keys into sub-ranges based on the latest - /// image layer that covers each range. (This is used when creating new - /// image layers) + /// image layer that covers each range at the specified lsn (inclusive). + /// This is used when creating new image layers. /// // FIXME: clippy complains that the result type is very complex. She's probably // right... @@ -541,8 +560,15 @@ where Ok(ranges) } - /// Count how many L1 delta layers there are that overlap with the - /// given key and LSN range. + /// Count the height of the tallest stack of deltas in this 2d region. + /// + /// This number is used to compute the largest number of deltas that + /// we'll need to visit for any page reconstruction in this region. + /// We use this heuristic to decide whether to create an image layer. + /// + /// TODO currently we just return the total number of deltas in the + /// region, no matter if they're stacked on top of each other + /// or next to each other. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; if lsn_range.start >= lsn_range.end { From 262265daad0d1a000d1f425d71ddfb723c3a05e8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 19 Jan 2023 18:49:36 +0100 Subject: [PATCH 028/412] Revert "Use actual temporary dir for pageserver unit tests" This reverts commit 826e89b9ce43ce2c4d046b2c5d6376c3de8dbbac. The problem with that commit was that it deletes the TempDir while there are still EphemeralFile instances open. At first I thought this could be fixed by simply adding Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None)) to TenantHarness::drop, but it turned out to be insufficient. So, reverting the commit until we find a proper solution. 
refs https://github.com/neondatabase/neon/issues/3385 --- .gitignore | 2 + control_plane/.gitignore | 1 + pageserver/src/config.rs | 5 + pageserver/src/tenant.rs | 105 +++++++++++------- pageserver/src/tenant/ephemeral_file.rs | 38 ++++--- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/virtual_file.rs | 8 +- pageserver/src/walingest.rs | 12 +- .../src/walreceiver/connection_manager.rs | 16 +-- test_runner/sql_regress/.gitignore | 1 + 10 files changed, 110 insertions(+), 80 deletions(-) create mode 100644 control_plane/.gitignore diff --git a/.gitignore b/.gitignore index 2e241ee8cd..f1afdee599 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ /pg_install /target +/tmp_check +/tmp_check_cli __pycache__/ test_output/ .vscode diff --git a/control_plane/.gitignore b/control_plane/.gitignore new file mode 100644 index 0000000000..c1e54a6bcb --- /dev/null +++ b/control_plane/.gitignore @@ -0,0 +1 @@ +tmp_check/ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f3e5fb8c1a..51d1664e52 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -693,6 +693,11 @@ impl PageServerConf { Ok(t_conf) } + #[cfg(test)] + pub fn test_repo_dir(test_name: &str) -> PathBuf { + PathBuf::from(format!("../tmp_check/test_{test_name}")) + } + pub fn dummy_conf(repo_dir: PathBuf) -> Self { let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c53c9bc3e1..c18c645e5b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,10 +2626,10 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; + use once_cell::sync::Lazy; use once_cell::sync::OnceCell; - use std::sync::Arc; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; - use tempfile::TempDir; use utils::logging; use utils::lsn::Lsn; @@ -2661,6 +2661,8 @@ pub mod harness { buf.freeze() } + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); + impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -2681,31 +2683,42 @@ pub mod harness { } } - /// The harness saves some boilerplate and provides a way to create functional tenant - /// without running pageserver binary. It uses temporary directory to store data in it. - /// Tempdir gets removed on harness drop. - pub struct TenantHarness { - // keep the struct to not to remove tmp dir during the test - _temp_repo_dir: TempDir, + pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, + + pub lock_guard: ( + Option>, + Option>, + ), } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - impl TenantHarness { - pub fn new() -> anyhow::Result { + impl<'a> TenantHarness<'a> { + pub fn create(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + LOG_HANDLE.get_or_init(|| { logging::init(logging::LogFormat::Test).expect("Failed to init test logging") }); - let temp_repo_dir = tempfile::tempdir()?; - // `TempDir` uses a randomly generated subdirectory of a system tmp dir, - // so far it's enough to take care of concurrently running tests. 
- let repo_dir = temp_repo_dir.path(); + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; - let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf()); + let conf = PageServerConf::dummy_conf(repo_dir); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); @@ -2723,10 +2736,10 @@ pub mod harness { fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { - _temp_repo_dir: temp_repo_dir, conf, tenant_conf, tenant_id, + lock_guard, }) } @@ -2820,8 +2833,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_basic")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2854,8 +2866,9 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("no_duplicate_timelines")? + .load() + .await; let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2886,8 +2899,7 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_branch")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2984,8 +2996,10 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3020,8 +3034,9 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? @@ -3070,8 +3085,9 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3093,8 +3109,9 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -3125,7 +3142,8 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "timeline_load"; + let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load().await; let tline = tenant @@ -3144,7 +3162,8 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "timeline_load_with_ancestor"; + let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { let tenant = harness.load().await; @@ -3182,7 +3201,8 @@ mod tests { #[tokio::test] async fn corrupt_metadata() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "corrupt_metadata"; + let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load().await; tenant @@ -3223,8 +3243,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_images")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3291,8 +3310,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_bulk_insert")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3336,8 +3354,7 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_random_updates")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3410,8 +3427,9 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_traverse_branches")? + .load() + .await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3495,8 +3513,9 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_traverse_ancestors")? + .load() + .await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 0debeaff1c..c433e65ad2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -277,7 +277,7 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), @@ -332,17 +332,25 @@ mod tests { use super::*; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockCursor; - use crate::tenant::harness::TenantHarness; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn harness() -> Result<(TenantHarness, TimelineId), io::Error> { - let harness = TenantHarness::new().expect("Failed to create tenant harness"); - let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(harness.timeline_path(&timeline_id))?; + fn harness( + test_name: &str, + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. + let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - Ok((harness, timeline_id)) + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + + Ok((conf, tenant_id, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -359,10 +367,10 @@ mod tests { } #[test] - fn test_ephemeral_files() -> io::Result<()> { - let (harness, timeline_id) = harness()?; + fn test_ephemeral_files() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; - let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -373,7 +381,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -390,10 +398,10 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> io::Result<()> { - let (harness, timeline_id) = harness()?; + fn test_ephemeral_blobs() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 58b7eea1eb..013591caee 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1064,7 +1064,7 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("upload_scheduling")?; let timeline_path = harness.timeline_path(&TIMELINE_ID); std::fs::create_dir_all(&timeline_path)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 3ad049cc21..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -525,13 +525,12 @@ mod tests { }) } - fn test_files(test_name: &str, openfunc: OF) -> Result<(), Error> + fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> where FD: Read + Write + Seek + FileExt, OF: Fn(&Path, &OpenOptions) -> Result, { - let temp_repo_dir = tempfile::tempdir()?; - let testdir = temp_repo_dir.path().join(test_name); + let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); @@ -633,8 +632,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let temp_repo_dir = tempfile::tempdir()?; - let testdir = temp_repo_dir.path().join("vfile_concurrency"); + let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; // Create a test file. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 77fce95160..0de2e6654d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1146,8 +1146,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1324,8 +1323,7 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1378,8 +1376,7 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1500,8 +1497,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index be58aa0e07..8b60e59305 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -846,7 +846,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("no_connection_no_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -879,7 +879,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("connection_no_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("no_connection_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1001,7 +1001,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1041,7 +1041,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1105,7 +1105,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1166,7 +1166,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1232,7 +1232,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + async fn dummy_state(harness: 
&TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/test_runner/sql_regress/.gitignore b/test_runner/sql_regress/.gitignore index 83186b5c86..89129d7358 100644 --- a/test_runner/sql_regress/.gitignore +++ b/test_runner/sql_regress/.gitignore @@ -2,6 +2,7 @@ /pg_regress # Generated subdirectories +/tmp_check/ /results/ /log/ From 47f9890bae75c63fc4b29c009cf9020ac2bcbafa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 20 Jan 2023 15:37:24 +0100 Subject: [PATCH 029/412] [compute_ctl] Make role deletion spec processing idempotent (#3380) Previously, we were trying to re-assign owned objects of the already deleted role. This were causing a crash loop in the case when compute was restarted with a spec that includes delta operation for role deletion. To avoid such cases, check that role is still present before calling `reassign_owned_objects`. Resolves neondatabase/cloud#3553 --- compute_tools/src/compute.rs | 3 ++- compute_tools/src/pg_helpers.rs | 4 ++-- compute_tools/src/spec.rs | 14 +++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e229bb1cc2..c8af8822b7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -252,7 +252,7 @@ impl ComputeNode { // If connection fails, // it may be the old node with `zenith_admin` superuser. // - // In this case we need to connect with old `zenith_admin`name + // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. let mut client = match Client::connect(self.connstr.as_str(), NoTls) { @@ -278,6 +278,7 @@ impl ComputeNode { Ok(client) => client, }; + // Proceed with post-startup configuration. Note, that order of operations is important. handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 921289d7c2..6ab2864721 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -130,8 +130,8 @@ impl Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. pub fn to_pg_options(&self) -> String { - // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails. - // For now we do not use generic `options` for roles. Once used, add + // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. + // For now, we do not use generic `options` for roles. Once used, add // `self.options.as_pg_options()` somewhere here. let mut params: String = "LOGIN".to_string(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 40c8366bf4..97cd623052 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -213,8 +213,20 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< if let Some(ops) = &node.spec.delta_operations { // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); + + // Fetch existing roles. We could've exported and used `existing_roles` from + // `handle_roles()`, but we only make this list there before creating new roles. + // Which is probably fine as we never create to-be-deleted roles, but that'd + // just look a bit untidy. 
Anyway, the entire `pg_roles` should be in shared + // buffers already, so this shouldn't be a big deal. + let mut xact = client.transaction()?; + let existing_roles: Vec = get_existing_roles(&mut xact)?; + xact.commit()?; + for op in ops { - if op.action == "delete_role" { + // Check that role is still present in Postgres, as this could be a + // restart with the same spec after role deletion. + if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { reassign_owned_objects(node, &op.name)?; } } From 802f17407259bf9ad027480721083662f66b13b1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 20 Jan 2023 18:19:52 +0200 Subject: [PATCH 030/412] fix: dont stop pageserver if we fail to calculate synthetic size --- pageserver/src/consumption_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f8a0bc6f08..c07026261d 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -59,7 +59,7 @@ pub async fn collect_metrics( None, None, "synthetic size calculation", - true, + false, async move { calculate_synthetic_size_worker(synthetic_size_calculation_interval) .instrument(info_span!("synthetic_size_worker")) From 478322ebf90f8580258b02e1fb5c899c7f8ad279 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 20 Jan 2023 20:21:36 +0200 Subject: [PATCH 031/412] Fix tenant size orphans (#3377) Before only the timelines which have passed the `gc_horizon` were processed which failed with orphans at the tree_sort phase. Example input in added `test_branched_empty_timeline_size` test case. The PR changes iteration to happen through all timelines, and in addition to that, any learned branch points will be calculated as they would had been in the original implementation if the ancestor branch had been over the `gc_horizon`. This also changes how tenants where all timelines are below `gc_horizon` are handled. Previously tenant_size 0 was returned, but now they will have approximately `initdb_lsn` worth of tenant_size. The PR also adds several new tenant size tests that describe various corner cases of branching structure and `gc_horizon` setting. They are currently disabled to not consume time during CI. Co-authored-by: Joonas Koivunen Co-authored-by: Anastasia Lubennikova --- pageserver/src/tenant/size.rs | 149 ++++++++++++--- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/regress/test_tenant_size.py | 244 ++++++++++++++++++++++-- 3 files changed, 360 insertions(+), 42 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index dd4bf768a7..2181d6d4dc 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -23,7 +23,13 @@ use tracing::*; pub struct ModelInputs { updates: Vec, retention_period: u64, + + /// Relevant lsns per timeline. + /// + /// This field is not required for deserialization purposes, which is mostly used in tests. The + /// LSNs explain the outcome (updates) but are not needed in size calculation. 
#[serde_as(as = "HashMap")] + #[serde(default)] timeline_inputs: HashMap, } @@ -32,6 +38,8 @@ pub struct ModelInputs { #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] last_record: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -178,21 +186,13 @@ pub(super) async fn gather_inputs( // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); - let timelines = tenant + // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + tenant .refresh_gc_info() .await .context("Failed to refresh gc_info before gathering inputs")?; - if timelines.is_empty() { - // All timelines are below tenant's gc_horizon; alternative would be to use - // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly - // missing GcInfo::retain_lsns or having obsolete values for cutoff's. - return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::new(), - }); - } + let timelines = tenant.list_timelines(); // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually @@ -201,13 +201,25 @@ pub(super) async fn gather_inputs( let mut updates = Vec::new(); - // record the per timline values used to determine `retention_period` + // record the per timeline values useful to debug the model inputs, also used to track + // ancestor_lsn without keeping a hold of Timeline let mut timeline_inputs = HashMap::with_capacity(timelines.len()); // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; + // mapping from (TimelineId, Lsn) => if this branch point has been handled already via + // GcInfo::retain_lsns or if it needs to have its logical_size calculated. + let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); + for timeline in timelines { + if !timeline.is_active() { + anyhow::bail!( + "timeline {} is not active, cannot calculate tenant_size now", + timeline.timeline_id + ); + } + let last_record_lsn = timeline.get_last_record_lsn(); let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { @@ -273,13 +285,30 @@ pub(super) async fn gather_inputs( // all timelines branch from something, because it might be impossible to pinpoint // which is the tenant_size_model's "default" branch. + + let ancestor_lsn = timeline.get_ancestor_lsn(); + updates.push(Update { - lsn: timeline.get_ancestor_lsn(), + lsn: ancestor_lsn, command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), timeline_id: timeline.timeline_id, }); + if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { + // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches + // which are over gc_horizon. for example, a "main" branch which never received any + // updates apart from initdb not have branch points recorded. 
+ referenced_branch_froms + .entry((parent_timeline_id, timeline.get_ancestor_lsn())) + .or_default(); + } + for (lsn, _kind) in &interesting_lsns { + // mark this visited so don't need to re-process this parent + *referenced_branch_froms + .entry((timeline.timeline_id, *lsn)) + .or_default() = true; + if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { updates.push(Update { lsn: *lsn, @@ -295,22 +324,10 @@ pub(super) async fn gather_inputs( } } - // all timelines also have an end point if they have made any progress - if last_record_lsn > timeline.get_ancestor_lsn() - && !interesting_lsns - .iter() - .any(|(lsn, _)| lsn == &last_record_lsn) - { - updates.push(Update { - lsn: last_record_lsn, - command: Command::EndOfBranch, - timeline_id: timeline.timeline_id, - }); - } - timeline_inputs.insert( timeline.timeline_id, TimelineInputs { + ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), @@ -321,6 +338,80 @@ pub(super) async fn gather_inputs( ); } + // iterate over discovered branch points and make sure we are getting logical sizes at those + // points. + for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { + if *handled { + continue; + } + + let timeline_id = *timeline_id; + let lsn = *lsn; + + match timeline_inputs.get(&timeline_id) { + Some(inputs) if inputs.ancestor_lsn == lsn => { + // we don't need an update at this branch point which is also point where + // timeline_id branch was branched from. + continue; + } + Some(_) => {} + None => { + // we should have this because we have iterated through all of the timelines + anyhow::bail!("missing timeline_input for {timeline_id}") + } + } + + if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { + updates.push(Update { + lsn, + timeline_id, + command: Command::Update(*size), + }); + + needed_cache.insert((timeline_id, lsn)); + } else { + let timeline = tenant + .get_timeline(timeline_id, false) + .context("find referenced ancestor timeline")?; + let parallel_size_calcs = Arc::clone(limit); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline.clone(), + lsn, + )); + + if let Some(parent_id) = timeline.get_ancestor_timeline_id() { + // we should not find new ones because we iterated tenants all timelines + anyhow::ensure!( + timeline_inputs.contains_key(&parent_id), + "discovered new timeline {parent_id} (parent of {timeline_id})" + ); + } + }; + } + + // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch + // point. this is needed by the model. + for (timeline_id, inputs) in timeline_inputs.iter() { + let lsn = inputs.last_record; + + if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { + // this means that the (timeline_id, last_record_lsn) represents a branch point + // we do not want to add EndOfBranch updates for these points because it doesn't fit + // into the current tenant_size_model. + continue; + } + + if lsn > inputs.ancestor_lsn { + // all timelines also have an end point if they have made any progress + updates.push(Update { + lsn, + command: Command::EndOfBranch, + timeline_id: *timeline_id, + }); + } + } + let mut have_any_error = false; while let Some(res) = joinset.join_next().await { @@ -379,6 +470,7 @@ pub(super) async fn gather_inputs( // handled by the variant order in `Command`. 
// updates.sort_unstable(); + // And another sort to handle Command::BranchFrom ordering // in case when there are multiple branches at the same LSN. let sorted_updates = sort_updates_in_tree_order(updates)?; @@ -574,7 +666,10 @@ fn updates_sort() { fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#; + // + // timelineinputs have been left out, because those explain the inputs, but don't participate + // in further size calculations. 
+ let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d6c4c32b0b..8476066691 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1206,6 +1206,9 @@ class PageserverHttpClient(requests.Session): return res_json def tenant_size(self, tenant_id: TenantId) -> int: + return self.tenant_size_and_modelinputs(tenant_id)[0] + + def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. """ @@ -1216,9 +1219,9 @@ class PageserverHttpClient(requests.Session): assert TenantId(res["id"]) == tenant_id size = res["size"] assert type(size) == int - # there are additional inputs, which are the collected raw information before being fed to the tenant_size_model - # there are no tests for those right now. 
- return size + inputs = res["inputs"] + assert type(inputs) is dict + return (size, inputs) def timeline_list( self, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 5747ae235f..72cfbc9dda 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,5 +1,6 @@ -from typing import List, Tuple +from typing import Any, List, Tuple +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.types import Lsn @@ -9,28 +10,247 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() - size = http_client.tenant_size(tenant_id) + initial_size = http_client.tenant_size(tenant_id) - # we should never have zero, because there should be the initdb however - # this is questionable if we should have anything in this case, as the - # gc_cutoff is negative - assert ( - size == 0 - ), "initial implementation returns zero tenant_size before last_record_lsn is past gc_horizon" + # we should never have zero, because there should be the initdb "changes" + assert initial_size > 0, "initial implementation returns ~initdb tenant_size" - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + main_branch_name = "main" + + with env.postgres.create_start( + main_branch_name, + tenant_id=tenant_id, + config_lines=["autovacuum=off", "checkpoint_timeout=10min"], + ) as pg: with pg.cursor() as cur: cur.execute("SELECT 1") row = cur.fetchone() assert row is not None assert row[0] == 1 size = http_client.tenant_size(tenant_id) - assert size == 0, "starting idle compute should not change the tenant size" + # we've disabled the autovacuum and checkpoint + # so background processes should not change the size. + # If this test will flake we should probably loosen the check + assert size == initial_size, "starting idle compute should not change the tenant size" # the size should be the same, until we increase the size over the # gc_horizon - size = http_client.tenant_size(tenant_id) - assert size == 0, "tenant_size should not be affected by shutdown of compute" + size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) + assert size == initial_size, "tenant_size should not be affected by shutdown of compute" + + expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"] + actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # type: ignore + assert actual_commands == expected_commands + + +def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): + """ + Issue found in production. Because the ancestor branch was under + gc_horizon, the branchpoint was "dangling" and the computation could not be + done. 
+ + Assuming gc_horizon = 50 + root: I 0---10------>20 + branch: |-------------------I---------->150 + gc_horizon + """ + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id) + + with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id) + + size_after_branching = http_client.tenant_size(tenant_id) + log.info(f"size_after_branching: {size_after_branching}") + + assert size_after_branching > initial_size + + +def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): + """ + More general version of test_branched_empty_timeline_size + + Assuming gc_horizon = 50 + + root: I 0------10 + first: I 10 + nth_0: I 10 + nth_1: I 10 + nth_n: 10------------I--------100 + """ + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_branch_name = "first" + env.neon_cli.create_branch(first_branch_name, tenant_id=tenant_id) + + size_after_branching = http_client.tenant_size(tenant_id) + + # this might be flaky like test_get_tenant_size_with_multiple_branches + # https://github.com/neondatabase/neon/issues/2962 + assert size_after_branching == initial_size + + last_branch_name = first_branch_name + last_branch = None + + for i in range(0, 4): + latest_branch_name = f"nth_{i}" + last_branch = env.neon_cli.create_branch( + latest_branch_name, ancestor_branch_name=last_branch_name, tenant_id=tenant_id + ) + last_branch_name = latest_branch_name + + size_after_branching = http_client.tenant_size(tenant_id) + assert size_after_branching == initial_size + + assert last_branch is not None + + with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, last_branch) + + size_after_writes = http_client.tenant_size(tenant_id) + assert size_after_writes > initial_size + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_branch_point_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = 15 + + main: 0--I-10------>20 + branch: |-------------------I---------->150 + gc_horizon + """ + + env = neon_simple_env + gc_horizon = 20_000 + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + http_client = env.pageserver.http_client() + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + size_before_branching = http_client.tenant_size(tenant_id) + + assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int + + branch_id = env.neon_cli.create_branch( + "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn + ) + + with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + 
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + + size_after = http_client.tenant_size(tenant_id) + + assert size_before_branching < size_after + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_parent_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = 5 + + main: 0----10----I->20 + branch: |-------------------I---------->150 + gc_horizon + """ + + env = neon_simple_env + gc_horizon = 200_000 + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + http_client = env.pageserver.http_client() + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + + flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)") + + wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + size_before_branching = http_client.tenant_size(tenant_id) + + assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int + + branch_id = env.neon_cli.create_branch( + "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn + ) + + with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + + size_after = http_client.tenant_size(tenant_id) + + assert size_before_branching < size_after + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_only_heads_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = small + + main: 0--------10-----I>20 + first: |-----------------------------I>150 + second: |---------I>30 + """ + + env = neon_simple_env + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": "1024"}) + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_id = env.neon_cli.create_branch("first", tenant_id=tenant_id) + second_id = env.neon_cli.create_branch("second", tenant_id=tenant_id) + + ids = {"main": main_id, "first": first_id, "second": second_id} + + latest_size = None + + # gc is not expected to change the results + + for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]: + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name]) + size_now = http_client.tenant_size(tenant_id) + if latest_size is not None: + assert size_now > latest_size + else: + assert size_now > initial_size + + latest_size = size_now def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): From 664a69e65ba79c8a3b6c5bd8d428de41f2243bd7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 23 Jan 2023 10:51:09 +0200 Subject: [PATCH 032/412] Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) --- pageserver/src/pgdatadir_mapping.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6ae70e3a30..cc521c5e35 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1405,15 +1405,15 @@ fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { Key { field1: 0x01, field2, - field3: segno, - field4: 0, + field3: 1, + field4: segno, field5: 0, field6: 0, }..Key { field1: 0x01, field2, - field3: segno, - field4: 0, + field3: 1, + field4: segno, field5: 1, field6: 0, } From edb02d3299772e9075561a3b41141d42894e3287 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 23 Jan 2023 15:08:48 +0200 Subject: [PATCH 033/412] Adding pageserver3 to staging (#3403) --- .github/ansible/staging.us-east-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 4891875369..b46e729e32 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -29,6 +29,8 @@ storage: ansible_host: i-0565a8b4008aa3f40 pageserver-2.us-east-2.aws.neon.build: ansible_host: i-01e31cdf7e970586a + pageserver-3.us-east-2.aws.neon.build: + ansible_host: i-0602a0291365ef7cc safekeepers: hosts: From f67f0c1c11b4f5226225195b5be956c154ea4200 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 23 Jan 2023 17:12:51 +0200 Subject: [PATCH 034/412] More tenant size fixes (#3410) Small changes, but hopefully this will help with the panic detected in staging, for which we cannot get the debugging information right now (end-of-branch before branch-point). --- libs/tenant_size_model/src/lib.rs | 83 ++++++++++++---------- libs/tenant_size_model/src/main.rs | 107 +++++++++++++++-------------- pageserver/src/tenant/size.rs | 15 +++- 3 files changed, 114 insertions(+), 91 deletions(-) diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index 92bec8aebe..b156e1be9d 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -134,22 +134,25 @@ impl Storage { op: Cow<'static, str>, lsn: u64, size: Option, - ) where + ) -> anyhow::Result<()> + where K: std::borrow::Borrow, Q: std::hash::Hash + Eq + std::fmt::Debug, { - let lastseg_id = *self.branches.get(branch).unwrap(); + let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; assert!(lsn > lastseg.end_lsn); + let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; + let newseg = Segment { op, parent: Some(lastseg_id), start_lsn: lastseg.end_lsn, end_lsn: lsn, - start_size: lastseg.end_size.unwrap(), + start_size, end_size: size, children_after: Vec::new(), needed: false, @@ -158,6 +161,8 @@ impl Storage { self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; + + Ok(()) } /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
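The hunk above replaces panicking `unwrap()` lookups in `Storage::insert_point` with `let ... else { anyhow::bail!(...) }`, so a missing branch or a missing `end_size` becomes a recoverable error the caller can propagate with `?`. A minimal, self-contained sketch of that pattern follows; the structs are simplified stand-ins (not the real `tenant_size_model` types) and it assumes the `anyhow` crate, as the patch itself does:

```rust
// Simplified sketch of the error-handling pattern introduced by this patch.
use std::collections::HashMap;

struct Segment {
    end_lsn: u64,
    end_size: Option<u64>,
}

#[derive(Default)]
struct Storage {
    branches: HashMap<String, usize>,
    segments: Vec<Segment>,
}

impl Storage {
    fn insert_point(&mut self, branch: &str, lsn: u64, size: Option<u64>) -> anyhow::Result<()> {
        // Missing branch: report it instead of panicking.
        let Some(lastseg_id) = self.branches.get(branch).copied() else {
            anyhow::bail!("branch not found: {branch:?}")
        };
        // Missing end_size on the latest segment: same treatment.
        let Some(start_size) = self.segments[lastseg_id].end_size else {
            anyhow::bail!("no end_size on latest segment for {branch:?}")
        };
        let _ = start_size; // the real code carries this into the new segment
        self.segments.push(Segment { end_lsn: lsn, end_size: size });
        *self.branches.get_mut(branch).expect("looked up above") = self.segments.len() - 1;
        Ok(())
    }
}

fn main() -> anyhow::Result<()> {
    let mut storage = Storage::default();
    // With no branches registered, this now returns Err instead of aborting the process.
    assert!(storage.insert_point("main", 100, Some(5_000)).is_err());
    Ok(())
}
```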
@@ -167,21 +172,24 @@ impl Storage { op: Cow<'static, str>, lsn_bytes: u64, size_bytes: i64, - ) where + ) -> anyhow::Result<()> + where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - let lastseg_id = *self.branches.get(branch).unwrap(); + let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; + let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; + let newseg = Segment { op, parent: Some(lastseg_id), start_lsn: lastseg.end_lsn, end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: lastseg.end_size.unwrap(), - end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64), + start_size: last_end_size, + end_size: Some((last_end_size as i64 + size_bytes) as u64), children_after: Vec::new(), needed: false, }; @@ -189,33 +197,33 @@ impl Storage { self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; + Ok(()) } - pub fn insert(&mut self, branch: &Q, bytes: u64) + pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64); + self.modify_branch(branch, "insert".into(), bytes, bytes as i64) } - pub fn update(&mut self, branch: &Q, bytes: u64) + pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "update".into(), bytes, 0i64); + self.modify_branch(branch, "update".into(), bytes, 0i64) } - pub fn delete(&mut self, branch: &Q, bytes: u64) + pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)); + self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) } - /// Panics if the parent branch cannot be found. 
pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> where K: std::borrow::Borrow + std::fmt::Debug, @@ -236,7 +244,7 @@ impl Storage { Ok(()) } - pub fn calculate(&mut self, retention_period: u64) -> SegmentSize { + pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { // Phase 1: Mark all the segments that need to be retained for (_branch, &last_seg_id) in self.branches.iter() { let last_seg = &self.segments[last_seg_id]; @@ -261,7 +269,7 @@ impl Storage { self.size_from_snapshot_later(0) } - fn size_from_wal(&self, seg_id: usize) -> SegmentSize { + fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { let seg = &self.segments[seg_id]; let this_size = seg.end_lsn - seg.start_lsn; @@ -272,10 +280,10 @@ impl Storage { for &child_id in seg.children_after.iter() { // try each child both ways let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id); + let p1 = self.size_from_wal(child_id)?; let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id); + let p2 = self.size_from_snapshot_later(child_id)?; if p1.total() < p2.total() { p1 } else { @@ -286,15 +294,15 @@ impl Storage { }; children.push(p); } - SegmentSize { + Ok(SegmentSize { seg_id, method: if seg.needed { WalNeeded } else { Wal }, this_size, children, - } + }) } - fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize { + fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { // If this is needed, then it's time to do the snapshot and continue // with wal method. let seg = &self.segments[seg_id]; @@ -305,10 +313,10 @@ impl Storage { for &child_id in seg.children_after.iter() { // try each child both ways let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id); + let p1 = self.size_from_wal(child_id)?; let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id); + let p2 = self.size_from_snapshot_later(child_id)?; if p1.total() < p2.total() { p1 } else { @@ -319,12 +327,12 @@ impl Storage { }; children.push(p); } - SegmentSize { + Ok(SegmentSize { seg_id, method: WalNeeded, this_size: seg.start_size, children, - } + }) } else { // If any of the direct children are "needed", need to be able to reconstruct here let mut children_needed = false; @@ -339,7 +347,7 @@ impl Storage { let method1 = if !children_needed { let mut children = Vec::new(); for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)); + children.push(self.size_from_snapshot_later(*child)?); } Some(SegmentSize { seg_id, @@ -355,20 +363,25 @@ impl Storage { let method2 = if children_needed || seg.children_after.len() >= 2 { let mut children = Vec::new(); for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)); + children.push(self.size_from_wal(*child)?); } + let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; Some(SegmentSize { seg_id, method: SnapshotAfter, - this_size: seg.end_size.unwrap(), + this_size, children, }) } else { None }; - match (method1, method2) { - (None, None) => panic!(), + Ok(match (method1, method2) { + (None, None) => anyhow::bail!( + "neither method was applicable: children_after={}, children_needed={}", + seg.children_after.len(), + children_needed + ), (Some(method), None) => method, (None, Some(method)) => method, (Some(method1), Some(method2)) => { @@ -378,7 +391,7 @@ impl Storage { method2 } } - } + }) } } diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs 
index 9378a98a09..e32dd055f4 100644 --- a/libs/tenant_size_model/src/main.rs +++ b/libs/tenant_size_model/src/main.rs @@ -7,118 +7,118 @@ use tenant_size_model::{Segment, SegmentSize, Storage}; // Main branch only. Some updates on it. -fn scenario_1() -> (Vec, SegmentSize) { +fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Main branch only. Some updates on it. -fn scenario_2() -> (Vec, SegmentSize) { +fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent - storage.update("main", 1_000); + storage.update("main", 1_000)?; - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Like 2, but more updates on main -fn scenario_3() -> (Vec, SegmentSize) { +fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Diverged branches -fn scenario_4() -> (Vec, SegmentSize) { +fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent for _ in 0..8 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } -fn scenario_5() -> (Vec, SegmentSize) { +fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { let mut storage = Storage::new("a"); - storage.insert("a", 5000); - storage.branch("a", "b").unwrap(); - storage.update("b", 4000); - storage.update("a", 2000); - storage.branch("a", "c").unwrap(); - storage.insert("c", 4000); - storage.insert("a", 2000); + 
storage.insert("a", 5000)?; + storage.branch("a", "b")?; + storage.update("b", 4000)?; + storage.update("a", 2000)?; + storage.branch("a", "c")?; + storage.insert("c", 4000)?; + storage.insert("a", 2000)?; - let size = storage.calculate(5000); + let size = storage.calculate(5000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } -fn scenario_6() -> (Vec, SegmentSize) { +fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { use std::borrow::Cow; const NO_OP: Cow<'static, str> = Cow::Borrowed(""); @@ -133,18 +133,18 @@ fn scenario_6() -> (Vec, SegmentSize) { let mut storage = Storage::new(None); - storage.branch(&None, branches[0]).unwrap(); // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064 - storage.branch(&branches[0], branches[1]).unwrap(); // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424 - storage.branch(&branches[0], branches[2]).unwrap(); // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400 + storage.branch(&None, branches[0])?; // at 0 + storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 + storage.branch(&branches[0], branches[1])?; // at 108951064 + storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 + storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 + storage.branch(&branches[0], branches[2])?; // at 283415424 + storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 + storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - let size = storage.calculate(100_000); + let size = storage.calculate(100_000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } fn main() { @@ -163,7 +163,8 @@ fn main() { eprintln!("invalid scenario {}", other); std::process::exit(1); } - }; + } + .unwrap(); graphviz_tree(&segments, &size); } @@ -251,7 +252,7 @@ fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { #[test] fn scenarios_return_same_size() { - type ScenarioFn = fn() -> (Vec, SegmentSize); + type ScenarioFn = fn() -> anyhow::Result<(Vec, SegmentSize)>; let truths: &[(u32, ScenarioFn, _)] = &[ (line!(), scenario_1, 8000), (line!(), scenario_2, 9000), @@ -262,7 +263,7 @@ fn scenarios_return_same_size() { ]; for (line, scenario, expected) in truths { - let (_, size) = scenario(); + let (_, size) = scenario().unwrap(); assert_eq!(*expected, size.total_children(), "scenario on line {line}"); } } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2181d6d4dc..61cb32fc76 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -194,6 +194,15 @@ pub(super) async fn gather_inputs( let timelines = tenant.list_timelines(); + if timelines.is_empty() { + // perhaps the tenant has just been created, and as such doesn't have any data yet + return Ok(ModelInputs { + updates: vec![], + retention_period: 0, + timeline_inputs: HashMap::default(), + }); + } + // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually // require new lsns to be inspected. 
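The hunk above adds an early return in `gather_inputs` for a tenant that has no timelines yet, so the size model is never asked to compute over a missing branch point. A minimal sketch of that guard follows, using hypothetical simplified types rather than the pageserver's real `ModelInputs`, and again assuming the `anyhow` crate:

```rust
// Simplified stand-in for the empty-tenant guard added by this patch.
#[derive(Debug, Default)]
struct ModelInputs {
    updates: Vec<String>,
    retention_period: u64,
}

fn gather_inputs(timeline_ids: &[u64]) -> anyhow::Result<ModelInputs> {
    if timeline_ids.is_empty() {
        // Perhaps the tenant has just been created and has no data yet:
        // return empty inputs instead of failing later in the model.
        return Ok(ModelInputs::default());
    }
    Ok(ModelInputs {
        // The real code records branch/update/end-of-branch commands per timeline;
        // these placeholder values are only illustrative.
        updates: timeline_ids.iter().map(|id| format!("update timeline {id}")).collect(),
        retention_period: 1_000,
    })
}

fn main() -> anyhow::Result<()> {
    let empty = gather_inputs(&[])?;
    assert!(empty.updates.is_empty());
    assert_eq!(empty.retention_period, 0);
    Ok(())
}
```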
@@ -505,10 +514,10 @@ impl ModelInputs { let Lsn(now) = *lsn; match op { Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz)); + storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; } Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None); + storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; } Command::BranchFrom(parent) => { // This branch command may fail if it cannot find a parent to branch from. @@ -517,7 +526,7 @@ impl ModelInputs { } } - Ok(storage.calculate(self.retention_period).total_children()) + Ok(storage.calculate(self.retention_period)?.total_children()) } } From eb36403e71210b1be7e2482fc385b8da8c149d5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 31 Jan 2023 14:06:35 +0100 Subject: [PATCH 035/412] Release 2023 01 31 (#3497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kirill Bulatov Co-authored-by: Heikki Linnakangas Co-authored-by: Anastasia Lubennikova Co-authored-by: bojanserafimov Co-authored-by: Christian Schwarz Co-authored-by: Alexey Kondratov Co-authored-by: Joonas Koivunen Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Sergey Melnikov Co-authored-by: Dmitry Rodionov Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com> Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Lassi Pölönen --- .../actions/run-python-test-set/action.yml | 4 +- .github/ansible/deploy.yaml | 6 +- .../dev-us-east-2-beta.neon-proxy-link.yaml | 1 + ...prod-us-east-2-delta.neon-proxy-link.yaml} | 24 +- ...us-west-2-eta.neon-proxy-scram-legacy.yaml | 61 ++ .github/workflows/build_and_test.yml | 572 +++--------- .github/workflows/deploy-dev.yml | 179 ++++ .github/workflows/deploy-prod.yml | 277 ++++++ .github/workflows/release.yml | 33 + Cargo.lock | 328 ++++--- Cargo.toml | 10 +- ...ompute-node-v14 => Dockerfile.compute-node | 34 +- Dockerfile.compute-node-v15 | 220 ----- compute_tools/Cargo.toml | 3 + compute_tools/src/bin/compute_ctl.rs | 32 +- compute_tools/src/http/api.rs | 27 +- compute_tools/src/logger.rs | 24 +- compute_tools/src/params.rs | 8 +- compute_tools/src/spec.rs | 23 +- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 37 +- libs/tracing-utils/Cargo.toml | 17 + libs/tracing-utils/src/http.rs | 96 ++ libs/tracing-utils/src/lib.rs | 168 ++++ libs/utils/Cargo.toml | 1 + libs/utils/src/http/error.rs | 17 +- libs/utils/src/logging.rs | 2 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_layer_map.rs | 224 ++--- pageserver/src/basebackup.rs | 50 +- pageserver/src/bin/pageserver.rs | 57 +- pageserver/src/broker_client.rs | 48 + pageserver/src/config.rs | 28 + pageserver/src/consumption_metrics.rs | 24 +- pageserver/src/context.rs | 199 ++++ pageserver/src/http/openapi_spec.yml | 10 +- pageserver/src/http/routes.rs | 170 ++-- pageserver/src/import_datadir.rs | 52 +- pageserver/src/lib.rs | 3 +- pageserver/src/metrics.rs | 166 +++- pageserver/src/page_service.rs | 223 +++-- pageserver/src/pgdatadir_mapping.rs | 169 ++-- pageserver/src/repository.rs | 11 + pageserver/src/task_mgr.rs | 45 +- pageserver/src/tenant.rs | 576 ++++++------ pageserver/src/tenant/config.rs | 7 +- pageserver/src/tenant/layer_map.rs | 877 +++++++++--------- .../layer_map/historic_layer_coverage.rs | 583 ++++++++++++ .../src/tenant/layer_map/layer_coverage.rs | 154 +++ pageserver/src/tenant/mgr.rs | 236 +++-- 
.../src/tenant/remote_timeline_client.rs | 25 +- pageserver/src/tenant/size.rs | 16 +- pageserver/src/tenant/storage_layer.rs | 47 + pageserver/src/tenant/tasks.rs | 13 +- pageserver/src/tenant/timeline.rs | 327 +++++-- .../src/{ => tenant/timeline}/walreceiver.rs | 44 - .../walreceiver/connection_manager.rs | 55 +- .../walreceiver/walreceiver_connection.rs | 29 +- pageserver/src/walingest.rs | 418 ++++++--- pageserver/src/walredo.rs | 329 +++++-- poetry.lock | 247 ++++- proxy/src/main.rs | 4 +- pyproject.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +- scripts/force_layer_download.py | 324 +++++++ storage_broker/src/bin/storage_broker.rs | 4 +- test_runner/fixtures/metrics.py | 12 +- test_runner/regress/test_tenant_conf.py | 55 +- test_runner/regress/test_tenant_detach.py | 46 +- test_runner/regress/test_tenants.py | 56 +- workspace_hack/Cargo.toml | 8 +- 71 files changed, 5779 insertions(+), 2408 deletions(-) rename .github/helm-values/{production.proxy.yaml => prod-us-east-2-delta.neon-proxy-link.yaml} (80%) create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml create mode 100644 .github/workflows/deploy-dev.yml create mode 100644 .github/workflows/deploy-prod.yml create mode 100644 .github/workflows/release.yml rename Dockerfile.compute-node-v14 => Dockerfile.compute-node (86%) delete mode 100644 Dockerfile.compute-node-v15 create mode 100644 libs/tracing-utils/Cargo.toml create mode 100644 libs/tracing-utils/src/http.rs create mode 100644 libs/tracing-utils/src/lib.rs create mode 100644 pageserver/src/broker_client.rs create mode 100644 pageserver/src/context.rs create mode 100644 pageserver/src/tenant/layer_map/historic_layer_coverage.rs create mode 100644 pageserver/src/tenant/layer_map/layer_coverage.rs rename pageserver/src/{ => tenant/timeline}/walreceiver.rs (83%) rename pageserver/src/{ => tenant/timeline}/walreceiver/connection_manager.rs (96%) rename pageserver/src/{ => tenant/timeline}/walreceiver/walreceiver_connection.rs (94%) create mode 100644 scripts/force_layer_download.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 990c7e25a9..29b04a3478 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -123,8 +123,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n4 uses four processes to run tests via pytest-xdist - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + # -n16 uses sixteen processes to run tests via pytest-xdist + EXTRA_PARAMS="-n16 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 4adc685684..a17dc9c78f 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -117,7 +117,8 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers tags: - pageserver @@ -186,6 +187,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s 
http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers tags: - safekeeper diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index cb062f705d..157ae66ed1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + domain: "pg.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml similarity index 80% rename from .github/helm-values/production.proxy.yaml rename to .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index dbaf3cd096..eff24302bb 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -1,37 +1,37 @@ +# Helm chart values for neon-proxy-link. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + settings: authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" + domain: "pg.neon.tech" sentryEnvironment: "production" # -- Additional labels for zenith-proxy pods podLabels: zenith_service: proxy zenith_env: production - zenith_region: us-west-2 - zenith_region_slug: oregon + zenith_region: us-east-2 + zenith_region_slug: us-east-2 service: + type: LoadBalancer annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: proxy-release.local - type: LoadBalancer + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech exposedService: annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml new file mode 100644 index 0000000000..3a5cde4b01 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -0,0 +1,61 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + sentryEnvironment: "production" + wssPort: 8443 + metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionInterval: "10min" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-west-2 + zenith_region_slug: us-west-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 918e568e27..89e12360f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,4 +1,4 @@ -name: Test and Deploy +name: Build and Test on: push: @@ -19,10 +19,12 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: tag: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -50,7 +52,7 @@ jobs: id: build-tag check-codestyle-python: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned options: --init @@ -85,7 +87,7 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -97,16 +99,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -133,7 +135,7 @@ jobs: run: cargo deny check build-neon: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -141,7 +143,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.sha }} @@ -194,24 +195,26 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + # Disabled for now # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the # compressed crates. 
- - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - # Fall back to older versions of the key, if no cache for current Cargo.lock was found - key: | - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# - name: Cache cargo deps +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# # Fall back to older versions of the key, if no cache for current Cargo.lock was found +# key: | +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -301,7 +304,7 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -334,7 +337,7 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -365,7 +368,7 @@ jobs: # while coverage is currently collected for the debug ones merge-allure-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -402,7 +405,7 @@ jobs: DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -418,16 +421,17 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -477,7 +481,7 @@ jobs: }" trigger-e2e-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -522,9 +526,10 @@ jobs: }" neon-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + # https://github.com/GoogleContainerTools/kaniko/issues/2005 + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -540,12 +545,16 @@ jobs: run: echo 
"{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-tools-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -558,11 +567,14 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-node-image: - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + runs-on: [ self-hosted, gen3, large ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug needs: [ tag ] strategy: fail-fast: false @@ -583,10 +595,13 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr vm-compute-node-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag, compute-node-image ] strategy: fail-fast: false @@ -631,7 +646,7 @@ jobs: test-images: needs: [ tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] steps: - name: Checkout @@ -673,20 +688,39 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] + container: golang:1.19-bullseye if: github.event_name != 'workflow_dispatch' - container: amazon/aws-cli - strategy: - fail-fast: false - matrix: - name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - - name: Promote image to latest + - name: Install Crane & ECR helper + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) - aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr push-docker-hub: runs-on: [ self-hosted, dev, x64 ] @@ -776,114 +810,11 @@ jobs: crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - calculate-deploy-targets: - runs-on: [ self-hosted, dev, x64 ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - outputs: - matrix-include: ${{ steps.set-matrix.outputs.include }} - steps: - - id: set-matrix - run: | - if [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": 
"production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' - echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" - exit 1 - fi - - deploy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1, us-east-2 ] - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version + - name: Cleanup ECR folder + run: rm -rf ~/.ecr deploy-pr-test-new: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly @@ -915,311 +846,40 @@ jobs: ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - deploy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] - environment: - name: prod-${{ matrix.target_region }} + if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Checkout uses: actions/checkout@v3 with: - submodules: true + submodules: false fetch-depth: 0 - - name: Redeploy + - name: Trigger deploy workflow + env: + GH_TOKEN: ${{ github.token }} run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh + gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh + gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-proxy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - deploy_link_proxy: true - deploy_legacy_scram_proxy: true - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-dev-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - promote-compatibility-data: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ deploy, deploy-proxy ] + needs: [ push-docker-hub, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml new file mode 100644 index 0000000000..409517bf63 --- /dev/null +++ b/.github/workflows/deploy-dev.yml @@ -0,0 +1,179 @@ +name: Neon Deploy dev + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +concurrency: + group: deploy-dev + cancel-in-progress: false + +jobs: + deploy-storage-new: + runs-on: [ self-hosted, gen3, small ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + options: --user root --privileged + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e 
SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + deploy_link_proxy: true + deploy_legacy_scram_proxy: true + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ 
matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml new file mode 100644 index 0000000000..e1954b5540 --- /dev/null +++ b/.github/workflows/deploy-prod.yml @@ -0,0 +1,277 @@ +name: Neon Deploy prod + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +concurrency: + group: deploy-prod + cancel-in-progress: false + +jobs: + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + deploy_link_proxy: true + deploy_legacy_scram_proxy: false + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + deploy_link_proxy: false + deploy_legacy_scram_proxy: true + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag 
}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + # Deploy to old account below + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorage + defaults: + run: + shell: bash + environment: + name: prod-old + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater + 
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker: + name: deploy storage broker on old staging and old prod + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..49e04ee001 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,33 @@ +name: Create Release Branch + +on: + schedule: + - cron: '0 10 * * 2' + +jobs: + create_release_branch: + runs-on: [ubuntu-latest] + + steps: + - name: Check out code + uses: actions/checkout@v3 + with: + ref: main + + - name: Get current date + id: date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Create release branch + run: git checkout -b release/${{ steps.date.outputs.date }} + + - name: Push new branch + run: git push origin release/${{ steps.date.outputs.date }} + + - name: Create pull request into release + uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + head: release/${{ steps.date.outputs.date }} + base: release + title: Release ${{ steps.date.outputs.date }} diff --git a/Cargo.lock b/Cargo.lock index d8aba9ba68..2985a654f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,11 +37,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "amplify_num" -version = "0.4.1" -source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -66,6 +61,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "archery" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +dependencies = [ + "static_assertions", +] + [[package]] name = "asn1-rs" version = "0.5.1" @@ -137,15 +141,6 @@ dependencies = [ "syn", ] -[[package]] -name = "atomic-polyfill" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" -dependencies = [ - "critical-section", -] - [[package]] name = "atty" version = "0.2.14" @@ -629,9 +624,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" @@ -750,13 +745,13 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.32" +version = "4.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" +checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2" dependencies = [ "bitflags", "clap_derive", - "clap_lex 0.3.0", + "clap_lex 0.3.1", "is-terminal", "once_cell", "strsim", @@ -765,9 +760,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.21" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", "proc-macro-error", @@ -787,9 +782,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" dependencies = [ "os_str_bytes", ] @@ -832,10 +827,11 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.32", + "clap 4.1.1", "futures", "hyper", "notify", + "opentelemetry", "postgres", "regex", "serde", @@ -844,7 +840,9 @@ dependencies = [ "tokio", "tokio-postgres", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "workspace_hack", ] @@ -887,7 +885,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "comfy-table", "git-version", "nix", @@ -988,12 +986,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] 
name = "crossbeam-channel" version = "0.5.6" @@ -1030,12 +1022,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.11" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", - "once_cell", ] [[package]] @@ -1152,6 +1143,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +dependencies = [ + "cfg-if", + "hashbrown 0.12.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.3.3" @@ -1506,15 +1510,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hash32" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -1530,19 +1525,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heapless" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.4", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.0" @@ -1804,9 +1786,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ "libc", "windows-sys", @@ -1916,12 +1898,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libm" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" - [[package]] name = "link-cplusplus" version = "1.0.8" @@ -2067,9 +2043,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "nix" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", "cfg-if", @@ -2081,9 +2057,9 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.2" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", @@ -2154,7 +2130,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2203,6 +2178,108 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry_api", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +dependencies = [ + "async-trait", + "futures", + "futures-util", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "prost", + "reqwest", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +dependencies = [ + "futures", + "futures-util", + "opentelemetry", + "prost", + "tonic", + "tonic-build", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "percent-encoding", + "rand", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "os_info" version = "3.5.1" @@ -2230,14 +2307,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" name = "pageserver" version = "0.1.0" dependencies = [ - "amplify_num", "anyhow", "async-stream", "async-trait", "byteorder", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "close_fds", "const_format", "consumption_metrics", @@ -2269,7 +2345,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", - "rstar", + "rpds", "scopeguard", "serde", "serde_json", @@ -2581,9 +2657,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.49" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -2683,7 +2759,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "consumption_metrics", "futures", "git-version", @@ -2742,14 
+2818,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", - "rand_hc", ] [[package]] @@ -2771,15 +2846,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.6.1" @@ -2930,7 +2996,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin 0.5.2", + "spin", "untrusted", "web-sys", "winapi", @@ -2950,14 +3016,12 @@ dependencies = [ ] [[package]] -name = "rstar" -version = "0.9.3" +name = "rpds" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" dependencies = [ - "heapless", - "num-traits", - "smallvec", + "archery", ] [[package]] @@ -3018,9 +3082,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.6" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ "bitflags", "errno", @@ -3093,7 +3157,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "crc32c", "fs2", @@ -3479,21 +3543,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "spin" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" -dependencies = [ - "lock_api", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -3507,7 +3556,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "futures", "futures-core", @@ -3639,9 +3688,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] @@ -3749,9 +3798,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.24.1" +version = "1.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" +checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb" dependencies = [ "autocfg", "bytes", @@ -4071,6 +4120,20 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -4102,6 +4165,22 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-utils" +version = "0.1.0" +dependencies = [ + "hyper", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", + "reqwest", + "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "try-lock" version = "0.2.4" @@ -4183,9 +4262,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "ureq" -version = "2.6.1" +version = "2.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" +checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d" dependencies = [ "base64 0.13.1", "log", @@ -4226,6 +4305,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "atty", "bincode", "byteorder", "bytes", @@ -4287,7 +4367,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "env_logger", "log", "once_cell", @@ -4534,11 +4614,13 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "crossbeam-utils", "either", "fail", + "futures", "futures-channel", + "futures-executor", "futures-task", "futures-util", "indexmap", @@ -4554,6 +4636,9 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", + "ring", + "rustls", "scopeguard", "serde", "serde_json", @@ -4561,6 +4646,7 @@ dependencies = [ "syn", "tokio", "tokio-util", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 74cc16d690..e6695c4246 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ nix = "0.26" notify = "5.0.0" num-traits = "0.2.15" once_cell = "1.13" +opentelemetry = "0.18.0" +opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.10.0" +tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -69,7 +73,7 @@ rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } routerify = "3" -rstar = "0.9.3" +rpds = "0.12.0" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" @@ -107,9 +111,6 @@ x509-parser = "0.14" env_logger = "0.10" log = "0.4" -## TODO switch when the new release is made -amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" } - ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } @@ -128,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be 
heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } +tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } ## Common library dependency diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node similarity index 86% rename from Dockerfile.compute-node-v14 rename to Dockerfile.compute-node index 2deb95a93f..936f368833 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node @@ -1,8 +1,5 @@ -# -# This file is identical to the Dockerfile.compute-node-v15 file -# except for the version of Postgres that is built. -# - +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust ARG TAG=pinned ######################################################################################### @@ -22,7 +19,8 @@ RUN apt update && \ # ######################################################################################### FROM build-deps AS pg-build -COPY vendor/postgres-v14 postgres +ARG PG_VERSION +COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -135,6 +133,27 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### +# +# Layer "unit-pg-build" +# compile unit extension +# +######################################################################################### +FROM build-deps AS unit-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ + tar xvzf 7.7.tar.gz && \ + cd postgresql-unit-7.7 && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + # unit extension's "create extension" script relies on absolute install path to fill some reference tables. + # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. + # This one-liner removes pgsql/ part of the path. + # NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix. 
+ find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -146,6 +165,7 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / +COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -158,7 +178,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # Compile and run the Neon-specific `compute_ctl` binary # ######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 deleted file mode 100644 index 8647ce2bf4..0000000000 --- a/Dockerfile.compute-node-v15 +++ /dev/null @@ -1,220 +0,0 @@ -# -# This file is identical to the Dockerfile.compute-node-v14 file -# except for the version of Postgres that is built. -# - -ARG TAG=pinned - -######################################################################################### -# -# Layer "build-deps" -# -######################################################################################### -FROM debian:bullseye-slim AS build-deps -RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev - -######################################################################################### -# -# Layer "pg-build" -# Build Postgres from the neon postgres repository. -# -######################################################################################### -FROM build-deps AS pg-build -COPY vendor/postgres-v15 postgres -RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ - # Enable some of contrib extensions - echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control - -######################################################################################### -# -# Layer "postgis-build" -# Build PostGIS from the upstream PostGIS mirror. 
-# -######################################################################################### -FROM build-deps AS postgis-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc - -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ - ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - ./configure && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - cd extensions/postgis && \ - make clean && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control - -######################################################################################### -# -# Layer "plv8-build" -# Build plv8 -# -######################################################################################### -FROM build-deps AS plv8-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils - -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ - rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control - -######################################################################################### -# -# Layer "h3-pg-build" -# Build h3_pg -# -######################################################################################### -FROM build-deps AS h3-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -# packaged cmake is too old -RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ - -q -O /tmp/cmake-install.sh \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ - cp -R /h3/usr / && \ - rm -rf build - -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control - -######################################################################################### -# -# Layer "neon-pg-ext-build" -# compile neon extensions -# -######################################################################################### -FROM build-deps AS neon-pg-ext-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY pgxn/ pgxn/ - -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install - -######################################################################################### -# -# Compile and run the Neon-specific `compute_ctl` binary -# -######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto - -######################################################################################### -# -# Clean up postgres folder before inclusion -# -######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql - -# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp - -# Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include - -# Remove static postgresql libraries - all compilation is finished, so we -# can now remove these files - they must be included in other binaries by now -# if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a - -######################################################################################### -# -# Final layer -# Put it all together into the final image -# -######################################################################################### -FROM debian:bullseye-slim -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl - -# Install: -# libreadline8 for psql -# libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -RUN apt update && \ - apt install --no-install-recommends -y \ - libreadline8 \ - libossp-uuid16 \ - libgeos-c1v5 \ - libgdal28 \ - libproj19 \ - libprotobuf-c1 \ - gdb && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -USER postgres -ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4536604bdf..f8c3481f57 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true @@ -19,7 +20,9 @@ tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e5ab8eb153..2c42662020 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -53,7 +53,7 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - init_logger(DEFAULT_LOG_LEVEL)?; + init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -84,6 +84,29 @@ fn main() -> Result<()> { } }; + // Extract OpenTelemetry context for the startup actions from the spec, and 
+ // attach it to the current tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + Some(TraceContextPropagator::new().extract(carrier).attach()) + } else { + None + }; + let pageserver_connstr = spec .cluster .settings @@ -140,6 +163,9 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. if let Some(mut pg) = pg { + // Startup is finished, exit the startup tracing span + drop(startup_context_guard); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -159,6 +185,10 @@ fn main() -> Result<()> { info!("shutting down"); } + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f2a49f332c..589a8e1434 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,16 +3,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use serde_json; use tracing::{error, info}; - -use crate::compute::ComputeNode; +use tracing_utils::http::OtelName; // Service function to handle all available routes. -async fn routes(req: Request, compute: Arc) -> Response { +async fn routes(req: Request, compute: &Arc) -> Response { + // + // NOTE: The URI path is currently included in traces. That's OK because + // it doesn't contain any variable parts or sensitive information. But + // please keep that in mind if you change the routing here. + // match (req.method(), req.uri().path()) { // Serialized compute state. (&Method::GET, "/status") => { @@ -30,7 +35,7 @@ async fn routes(req: Request, compute: Arc) -> Response (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); - let res = crate::checker::check_writability(&compute).await; + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -56,7 +61,19 @@ async fn serve(state: Arc) { async move { Ok::<_, Infallible>(service_fn(move |req: Request| { let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state).await) } + async move { + Ok::<_, Infallible>( + // NOTE: We include the URI path in the string. It + // doesn't contain any variable parts or sensitive + // information in this API. 
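// An illustrative, minimal sketch of the producer side of this handshake (it is not
// part of the patch itself, and the real control plane may not be written in Rust):
// the carrier that compute_ctl extracts above with `TraceContextPropagator::extract()`
// is a plain string map, so a caller only needs to run the matching `inject_context()`
// before embedding the map into the spec as `startup_tracing_context`. The function
// name below is hypothetical; the propagation calls are the same opentelemetry 0.18
// API already used in this patch.

    use std::collections::HashMap;

    use opentelemetry::propagation::TextMapPropagator;
    use opentelemetry::sdk::propagation::TraceContextPropagator;
    use opentelemetry::Context;

    /// Serialize the currently active trace context into a carrier map suitable for
    /// embedding into a compute spec as `startup_tracing_context`.
    fn startup_tracing_carrier() -> HashMap<String, String> {
        let mut carrier = HashMap::new();
        // `HashMap<String, String>` implements `Injector`, so the W3C
        // `traceparent`/`tracestate` fields are written as ordinary map entries.
        TraceContextPropagator::new().inject_context(&Context::current(), &mut carrier);
        carrier
    }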
+ tracing_utils::http::tracing_handler( + req, + |req| routes(req, &state), + OtelName::UriPath, + ) + .await, + ) + } })) } }); diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 57e5496e86..1b5cf647b0 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,21 +1,37 @@ -use anyhow::Result; +use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; -/// Initialize `env_logger` using either `default_level` or +/// Initialize logging to stderr, and OpenTelemetry tracing and exporter. +/// +/// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. -pub fn init_logger(default_level: &str) -> Result<()> { +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the destination, +/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See +/// `tracing-utils` package description. +/// +pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { + // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_writer(std::io::stderr); + // Initialize OpenTelemetry + let otlp_layer = + tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + + // Put it all together tracing_subscriber::registry() .with(env_filter) + .with(otlp_layer) .with(fmt_layer) .init(); + tracing::info!("logging and tracing started"); Ok(()) } diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs index 925a2f8ef3..0ce01ff478 100644 --- a/compute_tools/src/params.rs +++ b/compute_tools/src/params.rs @@ -1,3 +1,9 @@ pub const DEFAULT_LOG_LEVEL: &str = "info"; -pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres"; +// From Postgres docs: +// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified +// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM +// (see below), then SCRAM-based authentication will automatically be chosen instead. +// https://www.postgresql.org/docs/15/auth-password.html +// +// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles. pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5"; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97cd623052..bbd0ec21ed 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; @@ -22,6 +23,8 @@ pub struct ComputeSpec { /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, + + pub startup_tracing_context: Option>, } /// Cluster state seen from the perspective of the external tools @@ -152,8 +155,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { { RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5:' prefix first) - if pg_pwd[3..] 
!= *role.encrypted_password.as_ref().unwrap() { + // Check whether password changed or not (trim 'md5' prefix first if any) + // + // This is a backward compatibility hack, which comes from the times when we were using + // md5 for everyone and hashes were stored in the console db without md5 prefix. So when + // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, + // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. + // Here is the only place so far where we compare hashes, so it seems to be the best candidate + // to place this compatibility layer. + let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { + stripped + } else { + pg_pwd + }; + if pg_pwd != *role.encrypted_password.as_ref().unwrap() { RoleAction::Update } else { RoleAction::None @@ -372,13 +387,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { name.pg_quote(), db.owner.pg_quote() ); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } DatabaseAction::Create => { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); query.push_str(&db.to_pg_options()); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } }; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 880ab0e83c..07d220195b 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -8,6 +8,7 @@ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; +pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; pub use prometheus::{register_histogram, Histogram}; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b5027cb331..0d7aa2db55 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,6 +29,14 @@ pub enum TenantState { Broken, } +pub mod state { + pub const LOADING: &str = "loading"; + pub const ATTACHING: &str = "attaching"; + pub const ACTIVE: &str = "active"; + pub const STOPPING: &str = "stopping"; + pub const BROKEN: &str = "broken"; +} + impl TenantState { pub fn has_in_progress_downloads(&self) -> bool { match self { @@ -39,23 +47,32 @@ impl TenantState { Self::Broken => false, } } + + pub fn as_str(&self) -> &'static str { + match self { + TenantState::Loading => state::LOADING, + TenantState::Attaching => state::ATTACHING, + TenantState::Active => state::ACTIVE, + TenantState::Stopping => state::STOPPING, + TenantState::Broken => state::BROKEN, + } + } } /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { - /// Timeline is fully operational. If the containing Tenant is Active, the timeline's - /// background jobs are running otherwise they will be launched when the tenant is activated. + /// The timeline is recognized by the pageserver but is not yet operational. + /// In particular, the walreceiver connection loop is not running for this timeline. + /// It will eventually transition to state Active or Broken. + Loading, + /// The timeline is fully operational. 
+ /// It can be queried, and the walreceiver connection loop is running. Active, - /// A timeline is recognized by pageserver, but not yet ready to operate. - /// The status indicates, that the timeline could eventually go back to Active automatically: - /// for example, if the owning tenant goes back to Active again. - Suspended, - /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to - /// automatically become Active after certain events: only a management call can change this status. + /// The timeline was previously Loading or Active but is shutting down. + /// It cannot transition back into any other state. Stopping, - /// A timeline is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The timeline is broken and not operational (previous states: Loading or Active). Broken, } diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml new file mode 100644 index 0000000000..8c3d3f9063 --- /dev/null +++ b/libs/tracing-utils/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "tracing-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +hyper.workspace = true +opentelemetry = { workspace = true, features=["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions.workspace = true +reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } +tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } +tracing.workspace = true +tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs new file mode 100644 index 0000000000..3f80f49de1 --- /dev/null +++ b/libs/tracing-utils/src/http.rs @@ -0,0 +1,96 @@ +//! Tracing wrapper for Hyper HTTP server + +use hyper::HeaderMap; +use hyper::{Body, Request, Response}; +use std::future::Future; +use tracing::Instrument; +use tracing_opentelemetry::OpenTelemetrySpanExt; + +/// Configuration option for what to use as the "otel.name" field in the traces. +pub enum OtelName<'a> { + /// Use a constant string + Constant(&'a str), + + /// Use the path from the request. + /// + /// That's very useful information, but is not appropriate if the + /// path contains parameters that differ on ever request, or worse, + /// sensitive information like usernames or email addresses. + /// + /// See + UriPath, +} + +/// Handle an incoming HTTP request using the given handler function, +/// with OpenTelemetry tracing. +/// +/// This runs 'handler' on the request in a new span, with fields filled in +/// from the request. Notably, if the request contains tracing information, +/// it is propagated to the span, so that this request is traced as part of +/// the same trace. +/// +/// XXX: Usually, this is handled by existing libraries, or built +/// directly into HTTP servers. However, I couldn't find one for Hyper, +/// so I had to write our own. OpenTelemetry website has a registry of +/// instrumentation libraries at: +/// https://opentelemetry.io/registry/?language=rust&component=instrumentation +/// If a Hyper crate appears, consider switching to that. 
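+///
+/// A minimal usage sketch (the `handle_request` function below is
+/// hypothetical; see `compute_tools/src/http/api.rs` for a real caller):
+///
+/// ```rust,ignore
+/// use hyper::{Body, Request, Response};
+/// use tracing_utils::http::{tracing_handler, OtelName};
+///
+/// // Hypothetical application handler.
+/// async fn handle_request(_req: Request<Body>) -> Response<Body> {
+///     Response::new(Body::from("hello"))
+/// }
+///
+/// async fn serve_one(req: Request<Body>) -> Response<Body> {
+///     // Runs `handle_request` in a span named after the URI path,
+///     // continuing the trace from the request's `traceparent` header, if any.
+///     tracing_handler(req, handle_request, OtelName::UriPath).await
+/// }
+/// ```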
+pub async fn tracing_handler( + req: Request, + handler: F, + otel_name: OtelName<'_>, +) -> Response +where + F: Fn(Request) -> R, + R: Future>, +{ + // Create a tracing span, with context propagated from the incoming + // request if any. + // + // See list of standard fields defined for HTTP requests at + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md + // We only fill in a few of the most useful ones here. + let otel_name = match otel_name { + OtelName::Constant(s) => s, + OtelName::UriPath => req.uri().path(), + }; + + let span = tracing::info_span!( + "http request", + otel.name= %otel_name, + http.method = %req.method(), + http.status_code = tracing::field::Empty, + ); + let parent_ctx = extract_remote_context(req.headers()); + span.set_parent(parent_ctx); + + // Handle the request within the span + let response = handler(req).instrument(span.clone()).await; + + // Fill in the fields from the response code + let status = response.status(); + span.record("http.status_code", status.as_str()); + span.record( + "otel.status_code", + if status.is_success() { "OK" } else { "ERROR" }, + ); + + response +} + +// Extract remote tracing context from the HTTP headers +fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { + struct HeaderExtractor<'a>(&'a HeaderMap); + + impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + fn get(&self, key: &str) -> Option<&str> { + self.0.get(key).and_then(|value| value.to_str().ok()) + } + + fn keys(&self) -> Vec<&str> { + self.0.keys().map(|value| value.as_str()).collect() + } + } + let extractor = HeaderExtractor(headers); + opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor)) +} diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs new file mode 100644 index 0000000000..de0e2ad799 --- /dev/null +++ b/libs/tracing-utils/src/lib.rs @@ -0,0 +1,168 @@ +//! Helper functions to set up OpenTelemetry tracing. +//! +//! This comes in two variants, depending on whether you have a Tokio runtime available. +//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use +//! the current tokio runtime. If you don't have a runtime available, or you don't want +//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` +//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. +//! +//! Example: +//! +//! ```rust,no_run +//! use tracing_subscriber::prelude::*; +//! use tracing_opentelemetry::OpenTelemetryLayer; +//! +//! #[tokio::main] +//! async fn main() { +//! // Set up logging to stderr +//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() +//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); +//! let fmt_layer = tracing_subscriber::fmt::layer() +//! .with_target(false) +//! .with_writer(std::io::stderr); +//! +//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces +//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! +//! // Put it all together +//! tracing_subscriber::registry() +//! .with(env_filter) +//! .with(otlp_layer) +//! .with(fmt_layer) +//! .init(); +//! } +//! 
``` + +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; + +pub use tracing_opentelemetry::OpenTelemetryLayer; + +pub mod http; + +/// Set up OpenTelemetry exporter, using configuration from environment variables. +/// +/// `service_name` is set as the OpenTelemetry 'service.name' resource (see +/// ) +/// +/// We try to follow the conventions for the environment variables specified in +/// +/// +/// However, we only support a subset of those options: +/// +/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing +/// is enabled by default. Set it to "true" to disable. +/// +/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* +/// settings specified in +/// +/// are supported, as they are handled by the `opentelemetry-otlp` crate. +/// Settings related to other exporters have no effect. +/// +/// - Some other settings are supported by the `opentelemetry` crate. +/// +/// If you need some other setting, please test if it works first. And perhaps +/// add a comment in the list above to save the effort of testing for the next +/// person. +/// +/// This doesn't block, but is marked as 'async' to hint that this must be called in +/// asynchronous execution context. +pub async fn init_tracing(service_name: &str) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + Some(init_tracing_internal(service_name.to_string())) +} + +/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing +/// tasks. +pub fn init_tracing_without_runtime( + service_name: &str, +) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + + // The opentelemetry batch processor and the OTLP exporter needs a Tokio + // runtime. Create a dedicated runtime for them. One thread should be + // enough. + // + // (Alternatively, instead of batching, we could use the "simple + // processor", which doesn't need Tokio, and use "reqwest-blocking" + // feature for the OTLP exporter, which also doesn't need Tokio. However, + // batching is considered best practice, and also I have the feeling that + // the non-Tokio codepaths in the opentelemetry crate are less used and + // might be more buggy, so better to stay on the well-beaten path.) + // + // We leak the runtime so that it keeps running after we exit the + // function. + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("otlp runtime thread") + .worker_threads(1) + .build() + .unwrap(), + )); + let _guard = runtime.enter(); + + Some(init_tracing_internal(service_name.to_string())) +} + +fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { + // Set up exporter from the OTEL_EXPORTER_* environment variables + let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); + + // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the + // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the + // OpenTelemetry spec at + // , + // the full exporter URL is formed by appending "/v1/traces" to the value + // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does + // that with the grpc-tonic exporter. Other exporters, like the HTTP + // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without + // appending "/v1/traces". 
+ // + // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 + // + // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting + // the endpoint url with the "/v1/traces" path ourselves. If the bug is + // fixed in a later version, we can remove this code. But if we don't + // remember to remove this, it won't do any harm either, as the crate will + // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint + // is set directly with `with_endpoint`. + if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { + if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { + if !endpoint.ends_with('/') { + endpoint.push('/'); + } + endpoint.push_str("v1/traces"); + exporter = exporter.with_endpoint(endpoint); + } + } + + // Propagate trace information in the standard W3C TraceContext format. + opentelemetry::global::set_text_map_propagator( + opentelemetry::sdk::propagation::TraceContextPropagator::new(), + ); + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(exporter) + .with_trace_config( + opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .expect("could not initialize opentelemetry exporter") +} + +// Shutdown trace pipeline gracefully, so that it has a chance to send any +// pending traces before we exit. +pub fn shutdown_tracing() { + opentelemetry::global::shutdown_tracer_provider(); +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 020e4d9dd7..1f6c96bdbe 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b0ecb746d9..1ba0422993 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,6 +1,7 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; +use tracing::error; #[derive(Debug, Error)] pub enum ApiError { @@ -76,8 +77,16 @@ impl HttpErrorBody { } pub async fn handler(err: routerify::RouteError) -> Response { - tracing::error!("Error processing HTTP request: {:?}", err); - err.downcast::() - .expect("handler should always return api error") - .into_response() + let api_error = err + .downcast::() + .expect("handler should always return api error"); + + // Print a stack trace for Internal Server errors + if let ApiError::InternalServerError(_) = api_error.as_ref() { + error!("Error processing HTTP request: {api_error:?}"); + } else { + error!("Error processing HTTP request: {api_error:#}"); + } + + api_error.into_response() } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 82c9267f4a..02684d3d16 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_target(false) - .with_ansi(false) + .with_ansi(atty::is(atty::Stream::Stdout)) .with_writer(std::io::stdout); match log_format { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cb9e4478bf..66c25e8576 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -11,7 +11,6 @@ default = [] testing = 
["fail/failpoints"] [dependencies] -amplify_num.workspace = true anyhow.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -41,7 +40,6 @@ postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true regex.workspace = true -rstar.workspace = true scopeguard.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } @@ -68,6 +66,7 @@ tenant_size_model.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true +rpds.workspace = true [dev-dependencies] criterion.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6a01fdfc6f..e18c00da96 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,13 +1,12 @@ -use anyhow::Result; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; -use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -17,102 +16,35 @@ use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; -struct DummyDelta { - key_range: Range, - lsn_range: Range, -} - -impl Layer for DummyDelta { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - true - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -struct DummyImage { - key_range: Range, - lsn: Lsn, -} - -impl Layer for DummyImage { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - false - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines(); + let mut updates = layer_map.batch_update(); for fname in filenames { let fname = &fname.unwrap(); if let Some(imgfilename) = ImageFileName::parse_str(fname) { - let layer = DummyImage { - key_range: imgfilename.key_range, - lsn: imgfilename.lsn, + let layer = LayerDescriptor { + key: imgfilename.key_range, + lsn: imgfilename.lsn..(imgfilename.lsn + 1), + is_incremental: false, + short_id: 
fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, imgfilename.lsn); max_lsn = max(max_lsn, imgfilename.lsn); } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - let layer = DummyDelta { - key_range: deltafilename.key_range, - lsn_range: deltafilename.lsn_range.clone(), + let layer = LayerDescriptor { + key: deltafilename.key_range.clone(), + lsn: deltafilename.lsn_range.clone(), + is_incremental: true, + short_id: fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, deltafilename.lsn_range.start); max_lsn = max(max_lsn, deltafilename.lsn_range.end); } else { @@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { println!("min: {min_lsn}, max: {max_lsn}"); + updates.flush(); layer_map } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } +// Construct a partitioning for testing get_difficulty map when we +// don't have an exact result of `collect_keyspace` to work with. +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { + let mut parts = Vec::new(); + + // We add a partition boundary at the start of each image layer, + // no matter what lsn range it covers. This is just the easiest + // thing to do. A better thing to do would be to get a real + // partitioning from some database. Even better, remove the need + // for key partitions by deciding where to create image layers + // directly based on a coverage-based difficulty map. + let mut keys: Vec<_> = layer_map + .iter_historic_layers() + .filter_map(|l| { + if l.is_incremental() { + None + } else { + let kr = l.get_key_range(); + Some(kr.start.next()) + } + }) + .collect(); + keys.sort(); + + let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); + for key in keys { + parts.push(KeySpace { + ranges: vec![current_key..key], + }); + current_key = key; + } + + KeyPartitioning { parts } +} + // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. @@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) { // Benchmark using metadata extracted from a real project that was taknig // too long processing layer map queries. 
fn bench_from_real_project(c: &mut Criterion) { - // TODO consider compressing this file + // Init layer map + let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + println!("Finished layer map init in {:?}", now.elapsed()); + + // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern - c.bench_function("real_map_uniform_queries", |b| { + // Choose inputs for get_difficulty_map + let latest_lsn = layer_map + .iter_historic_layers() + .map(|l| l.get_lsn_range().end) + .max() + .unwrap(); + let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); + + // Check correctness of get_difficulty_map + // TODO put this in a dedicated test outside of this mod + { + println!("running correctness check"); + + let now = Instant::now(); + let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); + assert!(result_bruteforce.len() == partitioning.parts.len()); + println!("Finished bruteforce in {:?}", now.elapsed()); + + let now = Instant::now(); + let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); + assert!(result_fast.len() == partitioning.parts.len()); + println!("Finished fast in {:?}", now.elapsed()); + + // Assert results are equal. Manually iterate for easier debugging. + let zip = std::iter::zip( + &partitioning.parts, + std::iter::zip(result_bruteforce, result_fast), + ); + for (_part, (bruteforce, fast)) in zip { + assert_eq!(bruteforce, fast); + } + + println!("No issues found"); + } + + // Define and name the benchmark function + let mut group = c.benchmark_group("real_map"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.bench_function("get_difficulty_map", |b| { + b.iter(|| { + layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); + }); + }); + group.finish(); } // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map: LayerMap = LayerMap::default(); - // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) { // Putting it inside the `bench_function` closure is not a solution // because then it runs multiple times during warmup. let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); for i in 0..100_000 { - // TODO try inserting a super-wide layer in between every 10 to reflect - // what often happens with L1 layers that include non-rel changes. - // Maybe do that as a separate test. let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = DummyImage { - key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(10 * i), + let layer = LayerDescriptor { + key: zero.add(10 * i32)..zero.add(10 * i32 + 1), + lsn: Lsn(i)..Lsn(i + 1), + is_incremental: false, + short_id: format!("Layer {}", i), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); } - - // Manually measure runtime without criterion because criterion - // has a minimum sample size of 10 and I don't want to run it 10 times. 
- println!("Finished init in {:?}", now.elapsed()); + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); // Choose 100 uniformly random queries let rng = &mut StdRng::seed_from_u64(1); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map) - .choose_multiple(rng, 1) + .choose_multiple(rng, 100) .copied() .collect(); // Define and name the benchmark function - c.bench_function("sequential_uniform_queries", |b| { - // Run the search queries + let mut group = c.benchmark_group("sequential"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.finish(); } criterion_group!(group_1, bench_from_captest_env); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index f1d92ac36b..06d4853274 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -27,6 +27,7 @@ use tracing::*; /// use tokio_tar::{Builder, EntryType, Header}; +use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,6 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: &'a RequestContext, ) -> anyhow::Result<()> where W: AsyncWrite + Send + Sync + Unpin, @@ -110,6 +112,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + ctx, }; basebackup .send_tarball() @@ -129,6 +132,7 @@ where lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, + ctx: &'a RequestContext, } impl<'a, W> Basebackup<'a, W> @@ -171,23 +175,37 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn).await? { + for segno in self + .timeline + .list_slru_segments(kind, self.lsn, self.ctx) + .await? + { self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? { + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? + { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? { + for rel in self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await? + { self.add_rel(rel).await?; } } } - for xid in self.timeline.list_twophase_files(self.lsn).await? { + for xid in self + .timeline + .list_twophase_files(self.lsn, self.ctx) + .await? 
+ { self.add_twophase_file(xid).await?; } @@ -203,7 +221,10 @@ where } async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?; + let nblocks = self + .timeline + .get_rel_size(tag, self.lsn, false, self.ctx) + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -223,7 +244,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -245,14 +266,14 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn) + .get_slru_segment_size(slru, segno, self.lsn, self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) .await?; if slru == SlruKind::Clog { @@ -287,7 +308,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn) + .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; ensure!(img.len() == 512); Some(img) @@ -323,7 +344,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn) + .list_rels(spcnode, dbnode, self.lsn, self.ctx) .await? .is_empty() { @@ -356,7 +377,10 @@ where // Extract twophase state files // async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn).await?; + let img = self + .timeline + .get_twophase_file(xid, self.lsn, self.ctx) + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -394,12 +418,12 @@ where let checkpoint_bytes = self .timeline - .get_checkpoint(self.lsn) + .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_control_file(self.lsn) + .get_control_file(self.lsn, self.ctx) .await .context("failed get control bytes")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5de6e4def5..f2cd93bd3a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -13,6 +13,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, + context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ @@ -26,7 +27,7 @@ use utils::{ logging, postgres_backend::AuthType, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals::{self, Signal}, tcp_listener, }; @@ -85,7 +86,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.id.to_string())], + ); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -246,7 +250,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let signals = signals::install_shutdown_handlers()?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; 
+ WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -325,6 +329,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); task_mgr::spawn( MGMT_REQUEST_RUNTIME.handle(), TaskKind::MetricsCollection, @@ -338,6 +349,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { conf.metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, + metrics_ctx, ) .instrument(info_span!("metrics_collection")) .await?; @@ -349,17 +361,34 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - async move { - page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await - }, - ); + { + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, + None, + None, + "libpq endpoint listener", + true, + async move { + page_service::libpq_listener_main( + conf, + auth, + pageserver_listener, + conf.auth_type, + libpq_ctx, + ) + .await + }, + ); + } // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs new file mode 100644 index 0000000000..6c92967ca3 --- /dev/null +++ b/pageserver/src/broker_client.rs @@ -0,0 +1,48 @@ +//! The broker client instance of the pageserver, created during pageserver startup. +//! Used by each timelines' [`walreceiver`]. + +use crate::config::PageServerConf; + +use anyhow::Context; +use once_cell::sync::OnceCell; +use storage_broker::BrokerClientChannel; +use tracing::*; + +static BROKER_CLIENT: OnceCell = OnceCell::new(); + +/// +/// Initialize the broker client. This must be called once at page server startup. +/// +pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let broker_endpoint = conf.broker_endpoint.clone(); + + // Note: we do not attempt connecting here (but validate endpoints sanity). 
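+    // The returned client channel is lazy: it does not establish a
+    // connection until a timeline's walreceiver makes its first request
+    // through it.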
+ let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; + + if BROKER_CLIENT.set(broker_client).is_err() { + panic!("broker already initialized"); + } + + info!( + "Initialized broker client with endpoints: {}", + broker_endpoint + ); + Ok(()) +} + +/// +/// Get a handle to the broker client +/// +pub fn get_broker_client() -> &'static BrokerClientChannel { + BROKER_CLIENT.get().expect("broker client not initialized") +} + +pub fn is_broker_client_initialized() -> bool { + BROKER_CLIENT.get().is_some() +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..a3b051279d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -158,6 +158,8 @@ pub struct PageServerConf { pub synthetic_size_calculation_interval: Duration, pub test_remote_failures: u64, + + pub ondemand_download_behavior_treat_error_as_warn: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -222,6 +224,8 @@ struct PageServerConfigBuilder { synthetic_size_calculation_interval: BuilderValue, test_remote_failures: BuilderValue, + + ondemand_download_behavior_treat_error_as_warn: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -267,6 +271,8 @@ impl Default for PageServerConfigBuilder { metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), test_remote_failures: Set(0), + + ondemand_download_behavior_treat_error_as_warn: Set(false), } } } @@ -363,6 +369,14 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn ondemand_download_behavior_treat_error_as_warn( + &mut self, + ondemand_download_behavior_treat_error_as_warn: bool, + ) { + self.ondemand_download_behavior_treat_error_as_warn = + BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -422,6 +436,11 @@ impl PageServerConfigBuilder { test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, + ondemand_download_behavior_treat_error_as_warn: self + .ondemand_download_behavior_treat_error_as_warn + .ok_or(anyhow!( + "missing ondemand_download_behavior_treat_error_as_warn" + ))?, }) } } @@ -600,6 +619,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -724,6 +744,7 @@ impl PageServerConf { metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, } } } @@ -749,6 +770,11 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } +fn parse_toml_bool(name: &str, item: &Item) -> Result { + item.as_bool() + .with_context(|| format!("configure option {name} is not a bool")) +} + fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -907,6 +933,7 @@ log_format = 'json' defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, test_remote_failures: 0, + 
ondemand_download_behavior_treat_error_as_warn: false, }, "Correct defaults should be used when no config values are provided" ); @@ -954,6 +981,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c07026261d..d848ec5ee5 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,6 +3,7 @@ //! and push them to a HTTP endpoint. //! Cache metrics to send only the updated ones. //! +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use anyhow; @@ -47,12 +48,15 @@ pub async fn collect_metrics( metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, + ctx: RequestContext, ) -> anyhow::Result<()> { let mut ticker = tokio::time::interval(metric_collection_interval); info!("starting collect_metrics"); // spin up background worker that caclulates tenant sizes + let worker_ctx = + ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, @@ -61,7 +65,7 @@ pub async fn collect_metrics( "synthetic size calculation", false, async move { - calculate_synthetic_size_worker(synthetic_size_calculation_interval) + calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx) .instrument(info_span!("synthetic_size_worker")) .await?; Ok(()) @@ -79,7 +83,7 @@ pub async fn collect_metrics( return Ok(()); }, _ = ticker.tick() => { - if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await + if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await { error!("metrics collection failed: {err:?}"); } @@ -102,6 +106,7 @@ pub async fn collect_metrics_iteration( cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); trace!( @@ -110,7 +115,7 @@ pub async fn collect_metrics_iteration( ); // get list of tenants - let tenants = mgr::list_tenants().await; + let tenants = mgr::list_tenants().await?; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -137,7 +142,7 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?; // Only send timeline logical size when it is fully calculated. 
if is_exact { current_metrics.push(( @@ -258,6 +263,7 @@ pub async fn collect_metrics_iteration( /// Caclculate synthetic size for each active tenant pub async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); @@ -270,7 +276,13 @@ pub async fn calculate_synthetic_size_worker( }, _ = ticker.tick() => { - let tenants = mgr::list_tenants().await; + let tenants = match mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(e) => { + warn!("cannot get tenant list: {e:#}"); + continue; + } + }; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -280,7 +292,7 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size().await { + if let Err(e) = tenant.calculate_synthetic_size(ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs new file mode 100644 index 0000000000..e826d28e6d --- /dev/null +++ b/pageserver/src/context.rs @@ -0,0 +1,199 @@ +//! This module defines `RequestContext`, a structure that we use throughout +//! the pageserver to propagate high-level context from places +//! that _originate_ activity down to the shared code paths at the +//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! +//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: +//! 1. What high-level activity ([`TaskKind`]) needs this page? +//! We need that information as a categorical dimension for page access +//! statistics, which we, in turn, need to guide layer eviction policy design. +//! 2. How should we behave if, to produce the page image, we need to +//! on-demand download a layer file ([`DownloadBehavior`]). +//! +//! [`RequestContext`] satisfies those needs. +//! The current implementation is a small `struct` that is passed through +//! the call chain by reference. +//! +//! ### Future Work +//! +//! However, we do not intend to stop here, since there are other needs that +//! require carrying information from high to low levels of the app. +//! +//! Most importantly, **cancellation signaling** in response to +//! 1. timeouts (page_service max response time) and +//! 2. lifecycle requests (detach tenant, delete timeline). +//! +//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned +//! by the transitive callees of a request have finished. The keyword here +//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places, +//! `TaskHandle` in some places, and careful code review around `FuturesUnordered` +//! or `JoinSet` in other places. +//! +//! We do not yet have a systematic cancellation story in pageserver, and it is +//! pretty clear that [`RequestContext`] will be responsible for that. +//! So, the API already prepares for this role through the +//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods. +//! See their doc comments for details on how we will use them in the future. +//! +//! It is not clear whether or how we will enforce Structured Concurrency, and +//! what role [`RequestContext`] will play there. +//! So, the API doesn't prepare us for this topic. +//! +//! Other future uses of `RequestContext`: +//! 
- Communicate compute & IO priorities (user-initiated request vs. background-loop) +//! - Request IDs for distributed tracing +//! - Request/Timeline/Tenant-scoped log levels +//! +//! RequestContext might look quite different once it supports those features. +//! Likely, it will have a shape similar to Golang's `context.Context`. +//! +//! ### Why A Struct Instead Of Method Parameters +//! +//! What's typical about such information is that it needs to be passed down +//! along the call chain from high level to low level, but few of the functions +//! in the middle need to understand it. +//! Further, it is to be expected that we will need to propagate more data +//! in the future (see the earlier section on future work). +//! Hence, for functions in the middle of the call chain, we have the following +//! requirements: +//! 1. It should be easy to forward the context to callees. +//! 2. To propagate more data from high-level to low-level code, the functions in +//! the middle should not need to be modified. +//! The solution is to have a container structure ([`RequestContext`]) that +//! carries the information. Functions that don't care about what's in it +//! pass it along to callees. +//! +//! ### Why Not Task-Local Variables +//! +//! One could use task-local variables (the equivalent of thread-local variables) +//! to address the immediate needs outlined above. +//! However, we reject task-local variables because: +//! 1. they are implicit, thereby making it harder to trace the data flow in code +//! reviews and during debugging, +//! 2. they can be mutable, which enables implicit return data flow, +//! 3. they are restrictive in that code which fans out into multiple tasks, +//! or even threads, needs to carefully propagate the state. +//! +//! In contrast, information flow with [`RequestContext`] is +//! 1. always explicit, +//! 2. strictly uni-directional because RequestContext is immutable, +//! 3. tangible because a [`RequestContext`] is just a value. +//! When creating child activities, regardless of whether it's a task, +//! thread, or even an RPC to another service, the value can +//! be used like any other argument. +//! +//! The solution is that all code paths are infected with precisely one +//! [`RequestContext`] argument. Functions in the middle of the call chain +//! only need to pass it on. +use crate::task_mgr::TaskKind; + +// The main structure of this module, see module-level comment. +pub struct RequestContext { + task_kind: TaskKind, + download_behavior: DownloadBehavior, +} + +/// Desired behavior if the operation requires an on-demand download +/// to proceed. +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum DownloadBehavior { + /// Download the layer file. It can take a while. + Download, + + /// Download the layer file, but print a warning to the log. This should be used + /// in code where the layer file is expected to already exist locally. + Warn, + + /// Return a PageReconstructError::NeedsDownload error + Error, +} + +impl RequestContext { + /// Create a new RequestContext that has no parent. + /// + /// The function is called `new` because, once we add children + /// to it using `detached_child` or `attached_child`, the context + /// form a tree (not implemented yet since cancellation will be + /// the first feature that requires a tree). + /// + /// # Future: Cancellation + /// + /// The only reason why a context like this one can be canceled is + /// because someone explicitly canceled it. 
+ /// It has no parent, so it cannot inherit cancellation from there. + pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + /// Create a detached child context for a task that may outlive `self`. + /// + /// Use this when spawning new background activity that should complete + /// even if the current request is canceled. + /// + /// # Future: Cancellation + /// + /// Cancellation of `self` will not propagate to the child context returned + /// by this method. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and pass to the new task the child context as an argument. + /// That would be an ergonomic improvement. + /// + /// We could make new calls to this function fail if `self` is already canceled. + pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + self.child_impl(task_kind, download_behavior) + } + + /// Create a child of context `self` for a task that shall not outlive `self`. + /// + /// Use this when fanning-out work to other async tasks. + /// + /// # Future: Cancellation + /// + /// Cancelling a context will propagate to its attached children. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and track its `JoinHandle` inside the `RequestContext`. + /// + /// We could then provide another method to allow waiting for all child tasks + /// to finish. + /// + /// We could make new calls to this function fail if `self` is already canceled. + /// Alternatively, we could allow the creation but not spawn the task. + /// The method to wait for child tasks would return an error, indicating + /// that the child task was not started because the context was canceled. + pub fn attached_child(&self) -> Self { + self.child_impl(self.task_kind(), self.download_behavior()) + } + + /// Use this function when you should be creating a child context using + /// [`attached_child`] or [`detached_child`], but your caller doesn't provide + /// a context and you are unwilling to change all callers to provide one. + /// + /// Before we add cancellation, we should get rid of this method. + pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + Self::new(task_kind, download_behavior) + } + + fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + pub fn task_kind(&self) -> TaskKind { + self.task_kind + } + + pub fn download_behavior(&self) -> DownloadBehavior { + self.download_behavior + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f9b8a81dad..23faff7ace 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -430,6 +430,13 @@ paths: schema: type: string format: hex + - name: inputs_only + in: query + required: false + schema: + type: boolean + description: | + When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). @@ -449,8 +456,9 @@ paths: format: hex size: type: integer + nullable: true description: | - Size metric in bytes. + Size metric in bytes or null if inputs_only=true was given. 
"401": description: Unauthorized Error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1eb24c1507..a7802f3cbe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,8 +12,11 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, }; +use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; +use crate::tenant::mgr::TenantMapInsertError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ @@ -81,18 +84,39 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res fn apierror_from_prerror(err: PageReconstructError) -> ApiError { match err { PageReconstructError::Other(err) => ApiError::InternalServerError(err), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } PageReconstructError::WalRedo(err) => { ApiError::InternalServerError(anyhow::Error::new(err)) } } } +fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { + match e { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + ctx: &RequestContext, ) -> anyhow::Result { - let mut info = build_timeline_info_common(timeline)?; + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while @@ -102,6 +126,7 @@ async fn build_timeline_info( .get_current_logical_size_non_incremental( info.last_record_lsn, CancellationToken::new(), + ctx, ) .await?, ); @@ -109,7 +134,10 @@ async fn build_timeline_info( Ok(info) } -fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { +fn build_timeline_info_common( + timeline: &Arc, + ctx: &RequestContext, +) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -129,7 +157,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size() { + let current_logical_size = match timeline.get_current_logical_size(ctx) { Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); @@ -180,6 +208,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -208,6 +239,8 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-logical-size"); check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let response_data = async { let tenant = mgr::get_tenant(tenant_id, true) .await @@ -217,7 +250,7 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = - build_timeline_info(&timeline, include_non_incremental_logical_size) + build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx) .await .context( "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", @@ -239,11 +272,7 @@ fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() - .map(|v| { - url::form_urlencoded::parse(v.as_bytes()) - .into_owned() - .any(|(p, _)| p == param) - }) + .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param)) .unwrap_or(false) } @@ -252,13 +281,12 @@ fn get_query_param(request: &Request, param_name: &str) -> Result) -> Result) -> Result(timeline_info) } @@ -304,12 +336,13 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + info!("Handling tenant attach {tenant_id}"); let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - // FIXME: distinguish between "Tenant already exists" and other errors - mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -351,7 +385,9 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; json_response(StatusCode::ACCEPTED, ()) } @@ -413,6 +451,8 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await + .map_err(anyhow::Error::new) + .map_err(ApiError::InternalServerError)? 
.iter() .map(|(id, state)| TenantInfo { id: *id, @@ -453,21 +493,40 @@ async fn tenant_status(request: Request) -> Result, ApiErro json_response(StatusCode::OK, tenant_info) } +/// HTTP endpoint to query the current tenant_size of a tenant. +/// +/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used +/// to debug any of the calculations. Requires `tenant_id` request parameter, supports +/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model +/// values. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let inputs_only = if query_param_present(&request, "inputs_only") { + get_query_param(&request, "inputs_only")? + .parse() + .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))? + } else { + false + }; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; - // this can be long operation, it currently is not backed by any request coalescing or similar + // this can be long operation let inputs = tenant - .gather_size_inputs() + .gather_size_inputs(&ctx) .await .map_err(ApiError::InternalServerError)?; - let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + let size = if !inputs_only { + Some(inputs.calculate().map_err(ApiError::InternalServerError)?) + } else { + None + }; /// Private response type with the additional "unstable" `inputs` field. /// @@ -479,7 +538,9 @@ async fn tenant_size_handler(request: Request) -> Result, A #[serde_as(as = "serde_with::DisplayFromStr")] id: TenantId, /// Size is a mixture of WAL and logical size, so the unit is bytes. - size: u64, + /// + /// Will be none if `?inputs_only=true` was given. + size: Option, inputs: crate::tenant::size::ModelInputs, } @@ -506,6 +567,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TenantCreateRequest = json_request(&mut request).await?; let mut tenant_conf = TenantConfOpt::default(); @@ -583,34 +646,28 @@ async fn tenant_create_handler(mut request: Request) -> Result { - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - if let res @ Err(_) = tenant.wait_to_become_active().await { - // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } - json_response( - StatusCode::CREATED, - TenantCreateResponse(tenant.tenant_id()), - )? - } - None => json_response(StatusCode::CONFLICT, ())?, - }) + // We created the tenant. Existing API semantics are that the tenant + // is Active when this function returns. 
+ if let res @ Err(_) = new_tenant.wait_to_become_active().await { + // This shouldn't happen because we just created the tenant directory + // in tenant::mgr::create_tenant, and there aren't any remote timelines + // to load, so, nothing can really fail during load. + // Don't do cleanup because we don't know how we got here. + // The tenant will likely be in `Broken` state and subsequent + // calls will fail. + res.context("created tenant failed to become active") + .map_err(ApiError::InternalServerError)?; + } + json_response( + StatusCode::CREATED, + TenantCreateResponse(new_tenant.tenant_id()), + ) } async fn tenant_config_handler(mut request: Request) -> Result, ApiError> { @@ -732,7 +789,8 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result<()> { let mut pg_control: Option = None; @@ -69,7 +71,7 @@ pub async fn import_timeline_from_postgres_datadir( let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; if let Some(control_file) = - import_file(&mut modification, relative_path, &mut file, len).await? + import_file(&mut modification, relative_path, &mut file, len, ctx).await? { pg_control = Some(control_file); } @@ -99,6 +101,7 @@ pub async fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, + ctx, ) .await?; @@ -113,6 +116,7 @@ async fn import_rel( dboid: Oid, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); @@ -147,7 +151,10 @@ async fn import_rel( // FIXME: use proper error type for this, instead of parsing the error message. // Or better yet, keep track of which relations we've already created // https://github.com/neondatabase/neon/issues/3309 - if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await { + if let Err(e) = modification + .put_rel_creation(rel, nblocks as u32, ctx) + .await + { if e.to_string().contains("already exists") { debug!("relation {} already exists. we must be extending it", rel); } else { @@ -182,7 +189,7 @@ async fn import_rel( // // If we process rel segments out of order, // put_rel_extend will skip the update. 
- modification.put_rel_extend(rel, blknum).await?; + modification.put_rel_extend(rel, blknum, ctx).await?; Ok(()) } @@ -195,6 +202,7 @@ async fn import_slru( path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); @@ -211,7 +219,7 @@ async fn import_slru( ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification - .put_slru_segment_creation(slru, segno, nblocks as u32) + .put_slru_segment_creation(slru, segno, nblocks as u32, ctx) .await?; let mut rpageno = 0; @@ -252,15 +260,15 @@ async fn import_wal( tline: &Timeline, startpoint: Lsn, endpoint: Lsn, + ctx: &RequestContext, ) -> anyhow::Result<()> { - use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint).await?; + let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -283,6 +291,7 @@ async fn import_wal( file.seek(std::io::SeekFrom::Start(offset as u64))?; } + use std::io::Read; let nread = file.read_to_end(&mut buf)?; if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? @@ -297,7 +306,7 @@ async fn import_wal( while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -326,6 +335,7 @@ pub async fn import_basebackup_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); @@ -344,7 +354,7 @@ pub async fn import_basebackup_from_tar( match header.entry_type() { tokio_tar::EntryType::Regular => { if let Some(res) = - import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await? { // We found the pg_control file. pg_control = Some(res); @@ -376,13 +386,14 @@ pub async fn import_wal_from_tar( reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn).await?; + let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -431,7 +442,7 @@ pub async fn import_wal_from_tar( while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -466,6 +477,7 @@ async fn import_file( file_path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> Result> { let file_name = match file_path.file_name() { Some(name) => name.to_string_lossy(), @@ -498,14 +510,16 @@ async fn import_file( } "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } @@ -521,38 +535,40 @@ async fn import_file( match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader).await?; modification - .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..])) + .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 91cde477ad..09e21ae755 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,9 @@ mod auth; pub mod basebackup; +pub mod broker_client; pub mod config; pub mod consumption_metrics; +pub mod context; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -15,7 +17,6 @@ pub mod tenant; pub mod trace; pub mod virtual_file; pub mod walingest; -pub mod walreceiver; pub mod walrecord; pub mod walredo; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b61e64048b..6bd0eddbb5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,10 +1,12 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, 
register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::models::state; use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of @@ -35,11 +37,29 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "gc", ]; -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", +pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_storage_operations_seconds_sum", + "Total time spent on storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_storage_operations_seconds_count", + "Count of storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds_global", + "Time spent on storage operations", + &["operation"], get_buckets_for_critical_operations(), ) .expect("failed to define a metric") @@ -112,6 +132,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +// Metrics collected on tenant states. +const TENANT_STATE_OPTIONS: &[&str] = &[ + state::LOADING, + state::ATTACHING, + state::ACTIVE, + state::STOPPING, + state::BROKEN, +]; + +pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_states_count", + "Count of tenants per state", + &["tenant_id", "state"] + ) + .expect("Failed to register pageserver_tenant_states_count metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -375,18 +413,81 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +/// Similar to [`prometheus::HistogramTimer`] but does not record on drop. +pub struct StorageTimeMetricsTimer { + metrics: StorageTimeMetrics, + start: Instant, +} + +impl StorageTimeMetricsTimer { + fn new(metrics: StorageTimeMetrics) -> Self { + Self { + metrics, + start: Instant::now(), + } + } + + /// Record the time from creation to now. + pub fn stop_and_record(self) { + let duration = self.start.elapsed().as_secs_f64(); + self.metrics.timeline_sum.inc_by(duration); + self.metrics.timeline_count.inc(); + self.metrics.global_histogram.observe(duration); + } +} + +/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and +/// timeline total sum and count. 
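To make the intended use of these timing metrics concrete, here is a small sketch (an illustration, not code from the patch; run_compaction_pass and the module path are assumptions):

use pageserver::metrics::StorageTimeMetrics;

fn timed_compaction(metrics: &StorageTimeMetrics) -> anyhow::Result<()> {
    let timer = metrics.start_timer();
    run_compaction_pass()?; // placeholder for the real work being timed
    // Unlike prometheus::HistogramTimer, dropping `timer` records nothing:
    // the elapsed time reaches the per-timeline sum/count pair and the global
    // histogram only when stop_and_record() is called, so an early error
    // return above is simply not recorded.
    timer.stop_and_record();
    Ok(())
}

fn run_compaction_pass() -> anyhow::Result<()> { Ok(()) }

The same pattern applies to the TimelineMetrics fields below (flush_time_histo, compact_time_histo, and so on), which now hold StorageTimeMetrics instead of plain histograms.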
+#[derive(Clone, Debug)] +pub struct StorageTimeMetrics { + /// Sum of f64 seconds, per operation, tenant_id and timeline_id + timeline_sum: Counter, + /// Number of oeprations, per operation, tenant_id and timeline_id + timeline_count: IntCounter, + /// Global histogram having only the "operation" label. + global_histogram: Histogram, +} + +impl StorageTimeMetrics { + pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let global_histogram = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&[operation]) + .unwrap(); + + StorageTimeMetrics { + timeline_sum, + timeline_count, + global_histogram, + } + } + + /// Starts timing a new operation. + /// + /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop. + pub fn start_timer(&self) -> StorageTimeMetricsTimer { + StorageTimeMetricsTimer::new(self.clone()) + } +} + #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, + pub flush_time_histo: StorageTimeMetrics, + pub compact_time_histo: StorageTimeMetrics, + pub create_images_time_histo: StorageTimeMetrics, + pub init_logical_size_histo: StorageTimeMetrics, + pub logical_size_histo: StorageTimeMetrics, + pub load_layer_map_histo: StorageTimeMetrics, + pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, @@ -406,24 +507,16 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); + let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); + let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let create_images_time_histo = + StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); + let init_logical_size_histo = + StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); + let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let load_layer_map_histo = + StorageTimeMetrics::new("load layer 
map", &tenant_id, &timeline_id); + let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -453,6 +546,7 @@ impl TimelineMetrics { create_images_time_histo, init_logical_size_histo, logical_size_histo, + garbage_collect_histo, load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, @@ -478,7 +572,10 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); for op in STORAGE_TIME_OPERATIONS { - let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } for op in STORAGE_IO_TIME_OPERATIONS { let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); @@ -495,7 +592,10 @@ impl Drop for TimelineMetrics { } pub fn remove_tenant_metrics(tenant_id: &TenantId) { - let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); + let tid = tenant_id.to_string(); + for state in TENANT_STATE_OPTIONS { + let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); + } } use futures::Future; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 344a8d1c00..878928ae06 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,6 +13,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -30,19 +31,19 @@ use std::sync::Arc; use std::time::Duration; use tracing::*; use utils::id::ConnectionId; -use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - postgres_backend_async::{self, PostgresBackend}, + postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; use crate::auth::check_permission; use crate::basebackup; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; @@ -123,6 +124,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: TcpListener, auth_type: AuthType, + listener_ctx: RequestContext, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; let tokio_listener = tokio::net::TcpListener::from_std(listener)?; @@ -146,6 +148,9 @@ pub async fn libpq_listener_main( debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); + let connection_ctx = listener_ctx + .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + // PageRequestHandler tasks are not associated with any particular // timeline in the task manager. 
In practice most connections will // only deal with a particular timeline, but we don't know which one @@ -157,7 +162,7 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type), + page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), ); } Err(err) => { @@ -177,6 +182,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + connection_ctx: RequestContext, ) -> anyhow::Result<()> { // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* @@ -191,24 +197,24 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; - let mut conn_handler = PageServerHandler::new(conf, auth); + // XXX: pgbackend.run() should take the connection_ctx, + // and create a child per-query context when it invokes process_query. + // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler + // and create the per-query context in process_query ourselves. + let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); let pgbackend = PostgresBackend::new(socket, auth_type, None)?; - let result = pgbackend + match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await; - match result { + .await + { Ok(()) => { // we've been requested to shut down Ok(()) } Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { - // `ConnectionReset` error happens when the Postgres client closes the connection. - // As this disconnection happens quite often and is expected, - // we decided to downgrade the logging level to `INFO`. - // See: https://github.com/neondatabase/neon/issues/1683. - if io_error.kind() == io::ErrorKind::ConnectionReset { - info!("Postgres client disconnected"); + if is_expected_io_error(&io_error) { + info!("Postgres client disconnected ({io_error})"); Ok(()) } else { Err(io_error).context("Postgres connection error") @@ -255,30 +261,42 @@ struct PageServerHandler { _conf: &'static PageServerConf, auth: Option>, claims: Option, + + /// The context created for the lifetime of the connection + /// services by this PageServerHandler. + /// For each query received over the connection, + /// `process_query` creates a child context from this one. 
+ connection_ctx: RequestContext, } impl PageServerHandler { - pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + pub fn new( + conf: &'static PageServerConf, + auth: Option>, + connection_ctx: RequestContext, + ) -> Self { PageServerHandler { _conf: conf, auth, claims: None, + connection_ctx, } } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + ctx: RequestContext, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Make request tracer if needed - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); let path = tenant @@ -329,22 +347,27 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + // TODO: We could create a new per-request context here, with unique ID. + // Currently we use the same per-timeline context for all requests + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); - self.handle_get_rel_exists_request(&timeline, &req).await + self.handle_get_rel_exists_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::Nblocks(req) => { let _timer = metrics.get_rel_size.start_timer(); - self.handle_get_nblocks_request(&timeline, &req).await + self.handle_get_nblocks_request(&timeline, &req, &ctx).await } PagestreamFeMessage::GetPage(req) => { let _timer = metrics.get_page_at_lsn.start_timer(); - self.handle_get_page_at_lsn_request(&timeline, &req).await + self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::DbSize(req) => { let _timer = metrics.get_db_size.start_timer(); - self.handle_db_size_request(&timeline, &req).await + self.handle_db_size_request(&timeline, &req, &ctx).await } }; @@ -363,7 +386,8 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, @@ -372,12 +396,13 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout(tenant_id).await?; - let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -396,7 +421,7 @@ impl PageServerHandler { let mut copyin_stream = Box::pin(copyin_stream(pgb)); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn) + .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) .await?; // Drain the rest of the Copy data @@ -418,7 +443,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -426,10 +451,11 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn != start_lsn { return Err(QueryError::Other( @@ -446,7 +472,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -492,6 +518,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, + ctx: &RequestContext, ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint @@ -515,7 +542,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -525,7 +552,7 @@ impl PageServerHandler { if lsn == Lsn(0) { anyhow::bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; } anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -535,52 +562,60 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?; + let exists = timeline + .get_rel_exists(req.rel, lsn, req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -589,15 +624,17 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, 
&latest_gc_cutoff_lsn, ctx) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -608,7 +645,7 @@ impl PageServerHandler { */ let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -616,23 +653,25 @@ impl PageServerHandler { })) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_basebackup_request( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: RequestContext, ) -> anyhow::Result<()> { // check that the timeline exists - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, &ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -645,8 +684,15 @@ impl PageServerHandler { // Send a tarball of the latest layer on the timeline { let mut writer = pgb.copyout_writer(); - basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) - .await?; + basebackup::send_basebackup_tarball( + &mut writer, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; } pgb.write_message(&BeMessage::CopyDone)?; @@ -717,6 +763,7 @@ impl postgres_backend_async::Handler for PageServerHandler { pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { @@ -734,7 +781,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id) + self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); @@ -763,7 +810,7 @@ impl postgres_backend_async::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } @@ -784,7 +831,7 @@ impl postgres_backend_async::Handler for PageServerHandler { .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -835,7 +882,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) 
.await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { @@ -878,6 +925,7 @@ impl postgres_backend_async::Handler for PageServerHandler { base_lsn, end_lsn, pg_version, + ctx, ) .await { @@ -914,7 +962,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, @@ -944,7 +992,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -990,27 +1038,66 @@ impl postgres_backend_async::Handler for PageServerHandler { } } +#[derive(thiserror::Error, Debug)] +enum GetActiveTenantError { + #[error( + "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}" + )] + WaitForActiveTimeout { + latest_state: TenantState, + wait_time: Duration, + }, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTenantError) -> Self { + match e { + GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( + ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ), + GetActiveTenantError::Other(e) => QueryError::Other(e), + } + } +} + /// Get active tenant. /// /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. -async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { +async fn get_active_tenant_with_timeout( + tenant_id: TenantId, + _ctx: &RequestContext, /* require get a context to support cancellation in the future */ +) -> Result, GetActiveTenantError> { let tenant = mgr::get_tenant(tenant_id, false).await?; - match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { - Ok(wait_result) => wait_result - // no .context(), the error message is good enough and some tests depend on it - .map(move |()| tenant), - Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"), + let wait_time = Duration::from_secs(30); + match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { + Ok(Ok(())) => Ok(tenant), + // no .context(), the error message is good enough and some tests depend on it + Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Err(_) => { + let latest_state = tenant.current_state(); + if latest_state == TenantState::Active { + Ok(tenant) + } else { + Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state, + wait_time, + }) + } + } } } /// Shorthand for getting a reference to a Timeline of an Active tenant. 
-async fn get_active_timeline_with_timeout( +async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, -) -> anyhow::Result> { - get_active_tenant_with_timeout(tenant_id) - .await - .and_then(|tenant| tenant.get_timeline(timeline_id, true)) + ctx: &RequestContext, +) -> Result, GetActiveTenantError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; + let timeline = tenant.get_timeline(timeline_id, true)?; + Ok(timeline) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cc521c5e35..6f9035305d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,6 +7,7 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; @@ -97,6 +98,7 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -104,7 +106,7 @@ impl Timeline { ))); } - let nblocks = self.get_rel_size(tag, lsn, latest).await?; + let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -114,7 +116,7 @@ impl Timeline { } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } // Get size of a database in blocks @@ -124,13 +126,14 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn).await?; + let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest).await?; + let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -142,6 +145,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -154,7 +158,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest).await? + && !self.get_rel_exists(tag, lsn, latest, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -164,7 +168,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -186,6 +190,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, _latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -199,7 +204,7 @@ impl Timeline { } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -216,10 +221,11 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -244,9 +250,10 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } /// Get size of an SLRU segment @@ -255,9 +262,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; Ok(buf.get_u32_le()) } @@ -267,10 +275,11 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -291,6 +300,7 @@ impl Timeline { pub async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, + ctx: &RequestContext, ) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; @@ -313,6 +323,7 @@ impl Timeline { Lsn(mid * 8), &mut found_smaller, &mut found_larger, + ctx, ) .await?; @@ -362,14 +373,18 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, + ctx: &RequestContext, ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? { + for segno in self + .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .await? 
+ { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn) + .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn) + .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) .await?; if clog_page.len() == BLCKSZ as usize + 8 { @@ -394,11 +409,12 @@ impl Timeline { &self, kind: SlruKind, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), @@ -410,18 +426,21 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - self.get(key, lsn).await + let buf = self.get(key, lsn, ctx).await?; + Ok(buf) } pub async fn list_dbdirs( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; match DbDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.dbdirs), @@ -433,18 +452,20 @@ impl Timeline { &self, xid: TransactionId, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = twophase_file_key(xid); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; Ok(buf) } pub async fn list_twophase_files( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; match TwoPhaseDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.xids), @@ -452,12 +473,20 @@ impl Timeline { } } - pub async fn get_control_file(&self, lsn: Lsn) -> Result { - self.get(CONTROLFILE_KEY, lsn).await + pub async fn get_control_file( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.get(CHECKPOINT_KEY, lsn).await + pub async fn get_checkpoint( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CHECKPOINT_KEY, lsn, ctx).await } /// Does the same as get_current_logical_size but counted on demand. @@ -469,15 +498,16 @@ impl Timeline { &self, lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, lsn) + .list_rels(*spcnode, *dbnode, lsn, ctx) .await .context("list rels")? 
{ @@ -486,9 +516,9 @@ impl Timeline { } let relsize_key = rel_size_to_key(rel); let mut buf = self - .get(relsize_key, lsn) + .get(relsize_key, lsn, ctx) .await - .context("read relation size of {rel:?}")?; + .with_context(|| format!("read relation size of {rel:?}"))?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -501,7 +531,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -509,7 +543,7 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -519,14 +553,14 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn) + .list_rels(spcnode, dbnode, lsn, ctx) .await? .into_iter() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn).await?; + let mut buf = self.get(relsize_key, lsn, ctx).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -542,13 +576,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn).await?; + let buf = self.get(slrudir_key, lsn, ctx).await?; let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn).await?; + let mut buf = self.get(segsize_key, lsn, ctx).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -560,7 +594,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -723,9 +757,10 @@ impl<'a> DatadirModification<'a> { spcnode: Oid, dbnode: Oid, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -755,9 +790,10 @@ impl<'a> DatadirModification<'a> { &mut self, xid: TransactionId, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); @@ -781,16 +817,21 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub async 
fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { + pub async fn drop_dbdir( + &mut self, + spcnode: Oid, + dbnode: Oid, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true) + .get_db_size(spcnode, dbnode, req_lsn, true, ctx) .await?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -817,11 +858,12 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -833,7 +875,7 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key).await?)? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; // Add the new relation to the rel directory entry, and write it back @@ -865,13 +907,14 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true).await? { + if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -895,12 +938,13 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -916,12 +960,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
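As a rough end-to-end sketch of how the ctx-threaded DatadirModification API composes (illustrative only: the RelTag parameter, field values, and helper name are assumptions, and commit() is called the way the create_test_timeline helper below does):

use pageserver::context::RequestContext;
use pageserver::tenant::Timeline;
use utils::lsn::Lsn;

async fn create_small_rel(
    timeline: &Timeline,
    rel: RelTag, // use whichever module actually exports RelTag in this tree
    lsn: Lsn,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    let mut modification = timeline.begin_modification(lsn);
    // Register the relation with one block; the ctx is threaded through so a
    // missing layer file can trigger (or warn about) an on-demand download.
    modification.put_rel_creation(rel, 1, ctx).await?;
    modification.commit()?;
    Ok(())
}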
- pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -932,7 +976,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -949,10 +993,11 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { @@ -988,10 +1033,15 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { + pub async fn drop_slru_segment( + &mut self, + kind: SlruKind, + segno: u32, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -1015,9 +1065,13 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + pub async fn drop_twophase_file( + &mut self, + xid: TransactionId, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1111,7 +1165,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - async fn get(&self, key: Key) -> Result { + async fn get(&self, key: Key, ctx: &RequestContext) -> Result { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1132,7 +1186,7 @@ impl<'a> DatadirModification<'a> { } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn).await + self.tline.get(key, lsn, ctx).await } } @@ -1542,10 +1596,11 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let tline = tenant - .create_empty_timeline(timeline_id, Lsn(8), pg_version)? - .initialize()?; + .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)? 
+ .initialize(ctx)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; @@ -1598,7 +1653,7 @@ mod tests { assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { Some(timeline) => timeline, None => panic!("Should have a local timeline"), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 586fd20886..092503b7c5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,6 +37,17 @@ impl Key { | self.field6 as i128 } + pub fn from_i128(x: i128) -> Self { + Key { + field1: ((x >> 120) & 0xf) as u8, + field2: ((x >> 104) & 0xFFFF) as u32, + field3: (x >> 72) as u32, + field4: (x >> 40) as u32, + field5: (x >> 32) as u8, + field6: x as u32, + } + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 02e2e2ee14..09716ba0e0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -171,6 +171,9 @@ task_local! { /// #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum TaskKind { + // Pageserver startup, i.e., `main` + Startup, + // libpq listener task. It just accepts connection and spawns a // PageRequestHandler task for each connection. LibpqEndpointListener, @@ -183,13 +186,37 @@ pub enum TaskKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Manages the WAL receiver connection for one timeline. It subscribes to - // events from storage_broker, decides which safekeeper to connect to. It spawns a - // separate WalReceiverConnection task to handle each connection. + /// Manages the WAL receiver connection for one timeline. + /// It subscribes to events from storage_broker and decides which safekeeper to connect to. + /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library. + /// There is at most one connection at any given time. + /// + /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`. + /// The `Client` object is what library users use to make requests & get responses. + /// Internally, `Client` hands over requests to the `Connection` object. + /// The `Connection` object is responsible for speaking the wire protocol. + /// + /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. + /// That abstraction doesn't use `task_mgr`. + /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task. + /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. + /// + /// Once the connection is established, the `TaskHandle` task creates a + /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// the `Connection` object. + /// A `CancellationToken` created by the `TaskHandle` task ensures + /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. WalReceiverManager, - // Handles a connection to a safekeeper, to stream WAL to a timeline. 
- WalReceiverConnection, + /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`]. + /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionHandler, + + /// The task that polls the `tokio-postgres::Connection` object. + /// Spawned by task [`WalReceiverConnectionHandler`]. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionPoller, // Garbage collection worker. One per tenant GarbageCollector, @@ -200,6 +227,8 @@ pub enum TaskKind { // Initial logical size calculation InitialLogicalSizeCalculation, + OndemandLogicalSizeCalculation, + // Task that flushes frozen in-memory layers to disk LayerFlushTask, @@ -222,6 +251,12 @@ pub enum TaskKind { DownloadAllRemoteLayers, // Task that calculates synthetis size for all active tenants CalculateSyntheticSize, + + // A request that comes in via the pageserver HTTP API. + MgmtRequest, + + #[cfg(test)] + UnitTest, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c18c645e5b..2f45fe0dfc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,9 +48,10 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; +use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -174,7 +175,7 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self) -> anyhow::Result> { + pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); self.initialize_with_lock(&mut timelines, true, true) } @@ -188,7 +189,7 @@ impl UninitializedTimeline<'_> { mut self, timelines: &mut HashMap>, load_layer_map: bool, - launch_wal_receiver: bool, + activate: bool, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -221,13 +222,12 @@ impl UninitializedTimeline<'_> { "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" ) })?; - new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); - if launch_wal_receiver { - new_timeline.launch_wal_receiver(); + if activate { + new_timeline.activate(); } } } @@ -240,11 +240,12 @@ impl UninitializedTimeline<'_> { self, copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) .await .context("Failed to import basebackup")?; @@ -262,9 +263,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - let timeline = self.initialize()?; - - Ok(timeline) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -450,6 +449,7 @@ impl Tenant { /// /// If the operation 
fails, the timeline is left in the tenant's hash map in Broken state. On success, /// it is marked as Active. + #[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( &self, timeline_id: TimelineId, @@ -458,6 +458,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -573,6 +574,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, + ctx: &RequestContext, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 @@ -591,6 +593,7 @@ impl Tenant { // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, @@ -599,7 +602,7 @@ impl Tenant { "attach tenant", false, async move { - match tenant_clone.attach().await { + match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { tenant_clone.set_broken(&e.to_string()); @@ -615,8 +618,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn attach(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { // Create directory with marker file to indicate attaching state. // The load_local_tenants() function in tenant::mgr relies on the marker file // to determine whether a tenant has finished attaching. @@ -716,6 +719,7 @@ impl Tenant { index_parts.remove(&timeline_id).unwrap(), remote_metadata, remote_clients.remove(&timeline_id).unwrap(), + &ctx, ) .await .with_context(|| { @@ -765,6 +769,7 @@ impl Tenant { index_part: IndexPart, remote_metadata: TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) @@ -799,6 +804,7 @@ impl Tenant { local_metadata, ancestor, true, + ctx, ) .await } @@ -827,11 +833,12 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. /// - #[instrument(skip(conf, remote_storage), fields(tenant_id=%tenant_id))] + #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, + ctx: &RequestContext, ) -> Arc { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, @@ -855,6 +862,7 @@ impl Tenant { // Do all the hard work in a background task let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn); let _ = task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::InitialLoad, @@ -863,7 +871,7 @@ impl Tenant { "initial tenant load", false, async move { - match tenant_clone.load().await { + match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { tenant_clone.set_broken(&err.to_string()); @@ -884,8 +892,8 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. 
/// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn load(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { info!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -996,7 +1004,7 @@ impl Tenant { // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata) + self.load_local_timeline(timeline_id, local_metadata, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } @@ -1013,11 +1021,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))] + #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, + ctx: &RequestContext, ) -> anyhow::Result<()> { let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) @@ -1061,6 +1070,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + ctx, ) .await } @@ -1112,6 +1122,7 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, + _ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1153,6 +1164,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( self.is_active(), @@ -1190,13 +1202,16 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn).await?; + ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) + .await? + } + None => { + self.bootstrap_timeline(new_timeline_id, pg_version, ctx) .await? } - None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; Ok(Some(loaded_timeline)) @@ -1220,30 +1235,25 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" ); - let timeline_str = target_timeline_id - .map(|x| x.to_string()) - .unwrap_or_else(|| "-".to_string()); + let gc_result = self + .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await; - { - let _timer = STORAGE_TIME - .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) - .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr) - .await - } + gc_result } /// Perform one compaction iteration. /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. 
- pub async fn compaction_iteration(&self) -> anyhow::Result<()> { + pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!( self.is_active(), "Cannot run compaction iteration on inactive tenant" @@ -1265,7 +1275,7 @@ impl Tenant { for (timeline_id, timeline) in &timelines_to_compact { timeline - .compact() + .compact(ctx) .instrument(info_span!("compact_timeline", timeline = %timeline_id)) .await?; } @@ -1298,7 +1308,11 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { + pub async fn delete_timeline( + &self, + timeline_id: TimelineId, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1462,8 +1476,7 @@ impl Tenant { tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Active); - timeline.launch_wal_receiver(); + timeline.activate(); } } } @@ -1487,7 +1500,7 @@ impl Tenant { .values() .filter(|timeline| timeline.current_state() != TimelineState::Broken); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Suspended); + timeline.set_state(TimelineState::Stopping); } } TenantState::Broken => { @@ -1717,7 +1730,33 @@ impl Tenant { tenant_id: TenantId, remote_storage: Option, ) -> Tenant { - let (state, _) = watch::channel(state); + let (state, mut rx) = watch::channel(state); + + tokio::spawn(async move { + let current_state = *rx.borrow_and_update(); + let tid = tenant_id.to_string(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .inc(); + loop { + match rx.changed().await { + Ok(()) => { + let new_state = *rx.borrow(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .dec(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, new_state.as_str()]) + .inc(); + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); + return; + } + } + } + }); + Tenant { tenant_id, conf, @@ -1776,69 +1815,70 @@ impl Tenant { } pub(super) fn persist_tenant_config( + tenant_id: &TenantId, target_config_path: &Path, tenant_conf: TenantConfOpt, - first_save: bool, + creating_tenant: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly ? - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - - let mut target_config_file = VirtualFile::open_with_options( - target_config_path, - OpenOptions::new() - .truncate(true) // This needed for overwriting with small config files - .write(true) - .create_new(first_save), - )?; - - target_config_file - .write(conf_content.as_bytes()) - .context("Failed to write toml bytes into file") - .and_then(|_| { - target_config_file - .sync_all() - .context("Faile to fsync config file") - }) - .with_context(|| { + // imitate a try-block with a closure + let do_persist = |target_config_path: &Path| -> anyhow::Result<()> { + let target_config_parent = target_config_path.parent().with_context(|| { format!( - "Failed to write config file into path '{}'", + "Config path does not have a parent: {}", target_config_path.display() ) })?; - // fsync the parent directory to ensure the directory entry is durable - if first_save { - target_config_path - .parent() - .context("Config file does not have a parent") - .and_then(|target_config_parent| { - File::open(target_config_parent).context("Failed to open config parent") - }) - .and_then(|tenant_dir| { - tenant_dir - .sync_all() - .context("Failed to fsync config parent") - }) - .with_context(|| { - format!( - "Failed to fsync on first save for config {}", - target_config_path.display() - ) - })?; - } + info!("persisting tenantconf to {}", target_config_path.display()); - Ok(()) + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. + +[tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(creating_tenant) + // when creating a new tenant, first_save will be true and `.create(true)` will be + // ignored (per rust std docs). + // + // later when updating the config of created tenant, or persisting config for the + // first time for attached tenant, the `.create(true)` is used. + .create(true), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("write toml bytes into file") + .and_then(|_| target_config_file.sync_all().context("fsync config file")) + .context("write config file")?; + + // fsync the parent directory to ensure the directory entry is durable. + // before this was done conditionally on creating_tenant, but these management actions are rare + // enough to just fsync it always. + + crashsafe::fsync(target_config_parent)?; + Ok(()) + }; + + // this function is called from creating the tenant and updating the tenant config, which + // would otherwise share this context, so keep it here in one place. 
+ do_persist(target_config_path).with_context(|| { + format!( + "write tenant {tenant_id} config to {}", + target_config_path.display() + ) + }) } // @@ -1871,12 +1911,13 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); let gc_timelines = self - .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1917,7 +1958,10 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info( + &self, + ctx: &RequestContext, + ) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1927,7 +1971,7 @@ impl Tenant { // refresh all timelines let target_timeline_id = None; - self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await } @@ -1936,6 +1980,7 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().await; @@ -2007,7 +2052,9 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr).await?; + timeline + .update_gc_info(branchpoints, cutoff, pitr, ctx) + .await?; gc_timelines.push(timeline); } @@ -2019,53 +2066,53 @@ impl Tenant { /// Branch an existing timeline async fn branch_timeline( &self, - src: TimelineId, - dst: TimelineId, + src_timeline: &Arc, + dst_id: TimelineId, start_lsn: Option, + _ctx: &RequestContext, ) -> anyhow::Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().await; - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst, &timelines)? - }; - - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. 
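// [Editor's illustration, not part of the patch] A minimal sketch of the
// "imitate a try-block with a closure" pattern that the rewritten
// `persist_tenant_config` above uses: group every fallible step inside one
// closure returning `anyhow::Result`, then attach a single high-level context
// string at the call site. `write_marker` and the file name are hypothetical,
// chosen only for the example; the real code fsyncs via `crashsafe::fsync`.
use anyhow::Context;
use std::path::Path;

fn write_marker(dir: &Path) -> anyhow::Result<()> {
    let do_write = |dir: &Path| -> anyhow::Result<()> {
        let path = dir.join("marker");
        std::fs::write(&path, b"ok").context("write marker file")?;
        // fsync the parent directory so the new entry is durable
        let parent = std::fs::File::open(dir).context("open parent dir")?;
        parent.sync_all().context("fsync parent dir")?;
        Ok(())
    };
    // One place to build the expensive, human-readable context string.
    do_write(dir).with_context(|| format!("persist marker in {}", dir.display()))
}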
- - let src_timeline = self.get_timeline(src, false).with_context(|| { - format!( - "No ancestor {} found for timeline {}/{}", - src, self.tenant_id, dst - ) - })?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN let start_lsn = start_lsn.unwrap_or_else(|| { let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}"); lsn }); - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + // First acquire the GC lock so that another task cannot advance the GC + // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are + // creating the branch. + let _gc_cs = self.gc_cs.lock().await; + + // Create a placeholder for the new branch. This will error + // out if the new timeline ID is already in use. + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst_id, &timelines)? + }; + + // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR + // horizon on the source timeline + // + // We check it against both the planned GC cutoff stored in 'gc_info', + // and the 'latest_gc_cutoff' of the last GC that was performed. The + // planned GC cutoff in 'gc_info' is normally larger than + // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // changed the GC settings for the tenant to make the PITR window + // larger, but some of the data was already removed by an earlier GC + // iteration. + + // check against last actual 'latest_gc_cutoff' first + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, ))?; + + // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); @@ -2076,6 +2123,12 @@ impl Tenant { } } + // + // The branch point is valid, and we are still holding the 'gc_cs' lock + // so that GC cannot advance the GC cutoff until we are finished. + // Proceed with the branch creation. + // + // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. let RecordLsn { @@ -2094,7 +2147,7 @@ impl Tenant { let metadata = TimelineMetadata::new( start_lsn, dst_prev, - Some(src), + Some(src_id), start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, @@ -2103,15 +2156,15 @@ impl Tenant { let mut timelines = self.timelines.lock().unwrap(); let new_timeline = self .prepare_timeline( - dst, + dst_id, metadata, timeline_uninit_mark, false, - Some(src_timeline), + Some(Arc::clone(src_timeline)), )? 
.initialize_with_lock(&mut timelines, true, true)?; drop(timelines); - info!("branched timeline {dst} from {src} at {start_lsn}"); + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) } @@ -2122,6 +2175,7 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let timeline_uninit_mark = { let timelines = self.timelines.lock().unwrap(); @@ -2181,6 +2235,7 @@ impl Tenant { unfinished_timeline, pgdata_path, pgdata_lsn, + ctx, ) .await .with_context(|| { @@ -2352,7 +2407,10 @@ impl Tenant { /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn gather_size_inputs(&self) -> anyhow::Result { + pub async fn gather_size_inputs( + &self, + ctx: &RequestContext, + ) -> anyhow::Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -2364,15 +2422,15 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await } /// Calculate synthetic tenant size /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self) -> anyhow::Result { - let inputs = self.gather_size_inputs().await?; + pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { + let inputs = self.gather_size_inputs(ctx).await?; let size = inputs.calculate()?; @@ -2475,26 +2533,19 @@ fn try_create_target_tenant_dir( target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; let temporary_tenant_config_path = rebase_directory( &conf.tenant_config_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( - || { - format!( - "Failed to write tenant {} config to {}", - tenant_id, - temporary_tenant_config_path.display() - ) - }, - )?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create tenant {} temporary timelines directory {}", + "create tenant {} temporary timelines directory {}", tenant_id, temporary_tenant_timelines_dir.display() ) @@ -2505,7 +2556,7 @@ fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move tenant {} temporary directory {} into the permanent one {}", + "move tenant {} temporary directory {} into the permanent one {}", tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() @@ -2513,14 +2564,14 @@ fn try_create_target_tenant_dir( })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant {} dir parent for {}", + "get tenant {} dir parent for {}", 
tenant_id, target_tenant_directory.display() ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( - "Failed to fsync renamed directory's parent {} for tenant {}", + "fsync renamed directory's parent {} for tenant {}", target_dir_parent.display(), tenant_id, ) @@ -2743,11 +2794,17 @@ pub mod harness { }) } - pub async fn load(&self) -> Arc { - self.try_load().await.expect("failed to load test tenant") + pub async fn load(&self) -> (Arc, RequestContext) { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + ( + self.try_load(&ctx) + .await + .expect("failed to load test tenant"), + ctx, + ) } - pub async fn try_load(&self) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -2775,8 +2832,7 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } // FIXME starts background jobs - tenant.load().await?; - + tenant.load(ctx).await?; Ok(tenant) } @@ -2833,10 +2889,9 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -2849,15 +2904,15 @@ mod tests { drop(writer); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); @@ -2866,14 +2921,14 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let _ = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -2899,13 +2954,13 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; - let writer = tline.writer(); use std::str::from_utf8; + let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; + let writer = tline.writer(); + #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); #[allow(non_snake_case)] @@ -2925,7 +2980,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -2936,15 +2991,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?, "foobar at 0x20" ); @@ -2996,13 +3051,12 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = + let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3010,12 +3064,12 @@ mod tests { // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3034,16 +3088,17 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let (tenant, ctx) = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; - tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3085,40 +3140,40 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok()); + assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok()); Ok(()) } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3128,12 +3183,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50)).await?, + newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3145,14 +3200,14 @@ mod tests { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; tenant .get_timeline(TIMELINE_ID, true) .expect("cannot load timeline"); @@ -3166,15 +3221,15 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant @@ -3185,7 +3240,7 @@ mod tests { } // check that both of them are initially unloaded - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; // check that both, child and ancestor are loaded let _child_tline = tenant @@ -3203,11 +3258,11 @@ mod tests { async fn corrupt_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; - let tenant = harness.load().await; + let (tenant, ctx) = harness.load().await; tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -3219,7 +3274,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load().await.err().expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); assert!(err .to_string() .starts_with("Failed to parse metadata bytes from path")); @@ -3243,10 +3298,9 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3254,7 +3308,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; @@ -3262,7 +3316,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; @@ -3270,7 +3324,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; @@ -3278,26 +3332,26 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x30)).await?, + tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, TEST_IMG("foo at 0x30") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x40)).await?, + tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, TEST_IMG("foo at 0x40") ); @@ -3310,10 +3364,9 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let mut lsn = Lsn(0x10); @@ -3342,10 +3395,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3354,10 +3407,9 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3407,7 +3459,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3415,10 +3467,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3427,12 +3479,12 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3462,16 +3514,14 @@ mod tests { keyspace.add_key(test_key); } - let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3493,7 +3543,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3501,10 +3551,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3513,12 +3563,12 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -3528,18 +3578,16 @@ mod tests { let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; let mut lsn = Lsn(0); - let mut tline_id = TIMELINE_ID; #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3568,7 +3616,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn).await?, + tline.get(test_key, *lsn, &ctx).await?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c95a98fbc7..e66ee0ae36 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -28,7 +28,12 @@ pub mod defaults { pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 01c5359e88..ed1a32c8fd 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -9,24 +9,57 @@ //! are frozen, and it is split up into new image and delta layers and the //! corresponding files are written to disk. //! +//! Design overview: +//! +//! The `search` method of the layer map is on the read critical path, so we've +//! built an efficient data structure for fast reads, stored in `LayerMap::historic`. +//! Other read methods are less critical but still impact performance of background tasks. +//! +//! This data structure relies on a persistent/immutable binary search tree. See the +//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +//! Summary: A persistent/immutable BST (and persistent data structures in general) allows +//! you to modify the tree in such a way that each modification creates a new "version" +//! of the tree. When you modify it, you get a new version, but all previous versions are +//! still accessible too. So if someone is still holding a reference to an older version, +//! they continue to see the tree as it was then. The persistent BST stores all the +//! different versions in an efficient way. +//! +//! Our persistent BST maintains a map of which layer file "covers" each key. It has only +//! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property +//! to handle the LSN dimension. +//! +//! To build the layer map, we insert each layer to the persistent BST in LSN.start order, +//! starting from the oldest one. After each insertion, we grab a reference to that "version" +//! 
of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See +//! `historic_layer_coverage.rs`. +//! +//! To search for a particular key-LSN pair, you first look up the right "version" in the +//! BTreeMap. Then you search that version of the BST with the key. +//! +//! The persistent BST keeps all the versions, but there is no way to change the old versions +//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in +//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need +//! to throw away most of the persistent BST and build a new one, starting from the oldest +//! LSN. See `LayerMap::flush_updates()`. +//! +mod historic_layer_coverage; +mod layer_coverage; + +use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::storage_layer::{range_eq, range_overlaps}; -use amplify_num::i256; +use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; use anyhow::Result; -use num_traits::identities::{One, Zero}; -use num_traits::{Bounded, Num, Signed}; -use rstar::{RTree, RTreeObject, AABB}; -use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; -use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; -use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::{InMemoryLayer, Layer}; +use historic_layer_coverage::BufferedHistoricLayerCoverage; + +use super::storage_layer::range_eq; /// /// LayerMap tracks what layers exist on a timeline. @@ -51,8 +84,8 @@ pub struct LayerMap { /// pub frozen_layers: VecDeque>, - /// All the historic layers are kept here - historic_layers: RTree>, + /// Index of the historic layers optimized for search + historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. @@ -65,177 +98,64 @@ impl Default for LayerMap { open_layer: None, next_open_layer_at: None, frozen_layers: VecDeque::default(), - historic_layers: RTree::default(), l0_delta_layers: Vec::default(), + historic: BufferedHistoricLayerCoverage::default(), } } } -struct LayerRTreeObject { - layer: Arc, - - envelope: AABB<[IntKey; 2]>, +/// The primary update API for the layer map. +/// +/// Batching historic layer insertions and removals is good for +/// performance and this struct helps us do that correctly. +#[must_use] +pub struct BatchedUpdates<'a, L: ?Sized + Layer> { + // While we hold this exclusive reference to the layer map the type checker + // will prevent us from accidentally reading any unflushed updates. + layer_map: &'a mut LayerMap, } -// Representation of Key as numeric type. -// We can not use native implementation of i128, because rstar::RTree -// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). -// Overflow will cause panic in debug mode and incorrect area calculation in release mode, -// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). -// By using i256 as the type, even though all the actual values would fit in i128, we can be -// sure that multiplication doesn't overflow. 
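// [Editor's illustration, not part of the patch] A toy model of the search
// scheme described in the new layer_map module docs: first pick the right
// "version" by LSN, then query that version's key coverage. The real code uses
// a persistent tree with structural sharing; here every version is a full
// clone of a std BTreeMap, which is only meant to show the two-step lookup.
use std::collections::BTreeMap;

struct ToyHistoric {
    // version at each LSN -> (layer start key -> layer name)
    versions: BTreeMap<u64, BTreeMap<i128, String>>,
}

impl ToyHistoric {
    fn search(&self, key: i128, end_lsn: u64) -> Option<&String> {
        // Latest version with LSN <= end_lsn - 1, like `get_version(end_lsn.0 - 1)`.
        let (_lsn, version) = self.versions.range(..end_lsn).next_back()?;
        // Latest layer whose start key is <= key, a stand-in for `coverage.query(key)`.
        version.range(..=key).next_back().map(|(_, layer)| layer)
    }
}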
-// - -#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] -struct IntKey(i256); - -impl Copy for IntKey {} - -impl IntKey { - fn from(i: i128) -> Self { - IntKey(i256::from(i)) - } -} - -impl Bounded for IntKey { - fn min_value() -> Self { - IntKey(i256::MIN) - } - fn max_value() -> Self { - IntKey(i256::MAX) - } -} - -impl Signed for IntKey { - fn is_positive(&self) -> bool { - self.0 > i256::ZERO - } - fn is_negative(&self) -> bool { - self.0 < i256::ZERO - } - fn signum(&self) -> Self { - match self.0.cmp(&i256::ZERO) { - Ordering::Greater => IntKey(i256::ONE), - Ordering::Less => IntKey(-i256::ONE), - Ordering::Equal => IntKey(i256::ZERO), - } - } - fn abs(&self) -> Self { - IntKey(self.0.abs()) - } - fn abs_sub(&self, other: &Self) -> Self { - if self.0 <= other.0 { - IntKey(i256::ZERO) - } else { - IntKey(self.0 - other.0) - } - } -} - -impl Neg for IntKey { - type Output = Self; - fn neg(self) -> Self::Output { - IntKey(-self.0) - } -} - -impl Rem for IntKey { - type Output = Self; - fn rem(self, rhs: Self) -> Self::Output { - IntKey(self.0 % rhs.0) - } -} - -impl Div for IntKey { - type Output = Self; - fn div(self, rhs: Self) -> Self::Output { - IntKey(self.0 / rhs.0) - } -} - -impl Add for IntKey { - type Output = Self; - fn add(self, rhs: Self) -> Self::Output { - IntKey(self.0 + rhs.0) - } -} - -impl Sub for IntKey { - type Output = Self; - fn sub(self, rhs: Self) -> Self::Output { - IntKey(self.0 - rhs.0) - } -} - -impl Mul for IntKey { - type Output = Self; - fn mul(self, rhs: Self) -> Self::Output { - IntKey(self.0 * rhs.0) - } -} - -impl One for IntKey { - fn one() -> Self { - IntKey(i256::ONE) - } -} - -impl Zero for IntKey { - fn zero() -> Self { - IntKey(i256::ZERO) - } - fn is_zero(&self) -> bool { - self.0 == i256::ZERO - } -} - -impl Num for IntKey { - type FromStrRadixErr = ::FromStrRadixErr; - fn from_str_radix(str: &str, radix: u32) -> Result { - Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) - } -} - -impl PartialEq for LayerRTreeObject { - fn eq(&self, other: &Self) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(&self.layer, &other.layer) - } -} - -impl RTreeObject for LayerRTreeObject -where - L: ?Sized, -{ - type Envelope = AABB<[IntKey; 2]>; - fn envelope(&self) -> Self::Envelope { - self.envelope - } -} - -impl LayerRTreeObject +/// Provide ability to batch more updates while hiding the read +/// API so we don't accidentally read without flushing. +impl BatchedUpdates<'_, L> where L: ?Sized + Layer, { - fn new(layer: Arc) -> Self { - let key_range = layer.get_key_range(); - let lsn_range = layer.get_lsn_range(); + /// + /// Insert an on-disk layer. + /// + pub fn insert_historic(&mut self, layer: Arc) { + self.layer_map.insert_historic_noflush(layer) + } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ); - LayerRTreeObject { layer, envelope } + /// + /// Remove an on-disk layer from the map. + /// + /// This should be called when the corresponding file on disk has been deleted. 
+ /// + pub fn remove_historic(&mut self, layer: Arc) { + self.layer_map.remove_historic_noflush(layer) + } + + // We will flush on drop anyway, but this method makes it + // more explicit that there is some work being done. + /// Apply all updates + pub fn flush(self) { + // Flush happens on drop + } +} + +// Ideally the flush() method should be called explicitly for more +// controlled execution. But if we forget we'd rather flush on drop +// than panic later or read without flushing. +// +// TODO maybe warn if flush hasn't explicitly been called +impl Drop for BatchedUpdates<'_, L> +where + L: ?Sized + Layer, +{ + fn drop(&mut self) { + self.layer_map.flush_updates(); } } @@ -281,125 +201,91 @@ where /// 'open' and 'frozen' layers! /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { - // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; - let mut latest_img_lsn: Option = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0i128)], - [ - IntKey::from(key.to_i128()), - IntKey::from(end_lsn.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - let img_lsn = l.get_lsn_range().start; - assert!(img_lsn < end_lsn); - if Lsn(img_lsn.0 + 1) == end_lsn { - // found exact match - return Some(SearchResult { - layer: Arc::clone(l), - lsn_floor: img_lsn, - }); - } - if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { - latest_img = Some(Arc::clone(l)); - latest_img_lsn = Some(img_lsn); - } - } + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); - // Search the delta layers - let mut latest_delta: Option> = None; - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; + match (latest_delta, latest_image) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Some(SearchResult { + layer: image, + lsn_floor, + }) } - assert!(l.get_key_range().contains(&key)); - if l.get_lsn_range().start >= end_lsn { - info!( - "Candidate delta layer {}..{} is too new for lsn {}", - l.get_lsn_range().start, - l.get_lsn_range().end, - end_lsn - ); + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Some(SearchResult { + layer: delta, + lsn_floor, + }) } - assert!(l.get_lsn_range().start < end_lsn); - if l.get_lsn_range().end >= end_lsn { - // this layer contains the requested point in the key/lsn space. - // No need to search any further - trace!( - "found layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - latest_delta.replace(Arc::clone(l)); - break; - } - if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. 
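// A self-contained sketch of the flush-on-drop pattern used by BatchedUpdates above:
// mutations are buffered behind an exclusive borrow and applied exactly once, either
// via an explicit flush() or when the guard is dropped. The types here are
// stand-ins for illustration, not the real layer map.
struct Index {
    pending: Vec<u32>,
    committed: Vec<u32>,
}

struct Batch<'a> {
    index: &'a mut Index,
}

impl Batch<'_> {
    fn insert(&mut self, v: u32) {
        self.index.pending.push(v);
    }
    // Consuming self makes the flush point explicit; the actual work happens in Drop.
    fn flush(self) {}
}

impl Drop for Batch<'_> {
    fn drop(&mut self) {
        // Apply the whole batch in one go.
        self.index.committed.append(&mut self.index.pending);
    }
}

fn main() {
    let mut index = Index { pending: Vec::new(), committed: Vec::new() };
    {
        let mut batch = Batch { index: &mut index };
        batch.insert(1);
        batch.insert(2);
        batch.flush(); // dropping the guard would flush too
    }
    assert_eq!(index.committed, vec![1, 2]);
}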
- if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { - latest_delta.replace(Arc::clone(l)); - } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + }) } else { - latest_delta.replace(Arc::clone(l)); + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: delta, + lsn_floor, + }) } } } - if let Some(l) = latest_delta { - trace!( - "found (old) layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - let lsn_floor = std::cmp::max( - Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start, - ); - Some(SearchResult { - lsn_floor, - layer: l, - }) - } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap(), - layer: l, - }) - } else { - trace!("no layer found for request on {key} at {end_lsn}"); - None - } + } + + /// Start a batch of updates, applied on drop + pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + BatchedUpdates { layer_map: self } } /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { - self.l0_delta_layers.push(layer.clone()); + /// Helper function for BatchedUpdates::insert_historic + /// + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + ); + + if Self::is_l0(&layer) { + self.l0_delta_layers.push(layer); } - self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); } /// /// Remove an on-disk layer from the map. /// - /// This should be called when the corresponding file on disk has been deleted. + /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { + pub fn remove_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.remove(historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }); + + if Self::is_l0(&layer) { let len_before = self.l0_delta_layers.len(); // FIXME: ptr_eq might fail to return true for 'dyn' @@ -411,98 +297,57 @@ where .retain(|other| !Arc::ptr_eq(other, &layer)); assert_eq!(self.l0_delta_layers.len(), len_before - 1); } - assert!(self - .historic_layers - .remove(&LayerRTreeObject::new(layer)) - .is_some()); + NUM_ONDISK_LAYERS.dec(); } + /// Helper function for BatchedUpdates::drop. + pub(self) fn flush_updates(&mut self) { + self.historic.rebuild(); + } + /// Is there a newer image layer for given key- and LSN-range? Or a set /// of image layers within the specified lsn range that cover the entire /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
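// A small, std-only illustration of the lsn_floor rule in the rewritten search()
// above: when both a delta layer and an older image cover the key, reconstruction
// can start from the image, so the delta only needs to be read down to
// max(delta.lsn.start, image.lsn.start + 1). Plain u64 ranges stand in for layers;
// this is a sketch of the rule, not the patch's code.
use std::ops::Range;

fn delta_lsn_floor(delta: &Range<u64>, image: Option<&Range<u64>>) -> u64 {
    match image {
        // The image materializes everything at image.start, so only WAL strictly
        // above that LSN is needed from the delta.
        Some(image) => delta.start.max(image.start + 1),
        None => delta.start,
    }
}

fn main() {
    assert_eq!(delta_lsn_floor(&(100..200), Some(&(150..151))), 151);
    assert_eq!(delta_lsn_floor(&(100..200), Some(&(50..51))), 100);
    assert_eq!(delta_lsn_floor(&(100..200), None), 100);
}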
- pub fn image_layer_exists( - &self, - key_range: &Range, - lsn_range: &Range, - ) -> Result { - let mut range_remain = key_range.clone(); + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + if key.is_empty() { + // Vacuously true. There's a newer image for all 0 of the kerys in the range. + return Ok(true); + } - loop { - let mut made_progress = false; - let envelope = AABB::from_corners( - [ - IntKey::from(range_remain.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(range_remain.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - let img_lsn = l.get_lsn_range().start; - if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { - made_progress = true; - let img_key_end = l.get_key_range().end; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(false), + }; - if img_key_end >= range_remain.end { - return Ok(true); - } - range_remain.start = img_key_end; - } - } + let start = key.start.to_i128(); + let end = key.end.to_i128(); - if !made_progress { + let layer_covers = |layer: Option>| match layer { + Some(layer) => layer.get_lsn_range().start >= lsn.start, + None => false, + }; + + // Check the start is covered + if !layer_covers(version.image_coverage.query(start)) { + return Ok(false); + } + + // Check after all changes of coverage + for (_, change_val) in version.image_coverage.range(start..end) { + if !layer_covers(change_val) { return Ok(false); } } + + Ok(true) } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic_layers.iter().map(|e| e.layer.clone()) - } - - /// Find the last image layer that covers 'key', ignoring any image layers - /// newer than 'lsn'. 
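// An illustrative, std-only sketch of the coverage check performed by the new
// image_layer_exists above. Coverage is a set of change points: a sorted map from
// key to the image covering keys from that point up to the next change point (here
// the image is represented only by its lsn.start), or None for a gap. The whole key
// range is covered by sufficiently new images iff the value at the range start and
// at every change point inside the range passes the check. Names and types are
// examples only; the real code also selects the right coverage version by LSN first.
use std::collections::BTreeMap;
use std::ops::Range;

fn image_exists_for(coverage: &BTreeMap<i128, Option<u64>>, key: Range<i128>, min_lsn: u64) -> bool {
    if key.is_empty() {
        return true; // vacuously true for an empty key range
    }
    let covers = |img_lsn: Option<u64>| img_lsn.map_or(false, |lsn| lsn >= min_lsn);

    // Coverage at the start of the range ...
    let at_start = coverage.range(..=key.start).next_back().and_then(|(_, v)| *v);
    if !covers(at_start) {
        return false;
    }
    // ... and at every change point inside it.
    coverage.range(key).all(|(_, v)| covers(*v))
}

fn main() {
    // Images at LSN 100 over keys [0, 10) and LSN 90 over keys [10, 20); gap after.
    let coverage = BTreeMap::from([(0, Some(100)), (10, Some(90)), (20, None)]);
    assert!(image_exists_for(&coverage, 0..10, 95));
    assert!(!image_exists_for(&coverage, 0..20, 95)); // second image is too old
    assert!(!image_exists_for(&coverage, 15..25, 80)); // gap after key 20
}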
- fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { - let mut candidate_lsn = Lsn(0); - let mut candidate = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0)], - [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - - assert!(l.get_key_range().contains(&key)); - let this_lsn = l.get_lsn_range().start; - assert!(this_lsn <= lsn); - if this_lsn < candidate_lsn { - // our previous candidate was better - continue; - } - candidate_lsn = this_lsn; - candidate = Some(Arc::clone(l)); - } - - candidate + self.historic.iter() } /// @@ -518,94 +363,288 @@ where key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let mut points = vec![key_range.start]; - let envelope = AABB::from_corners( - [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], - [ - IntKey::from(key_range.end.to_i128()), - IntKey::from(lsn.0 as i128), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - assert!(l.get_lsn_range().start <= lsn); - let range = l.get_key_range(); - if key_range.contains(&range.start) { - points.push(l.get_key_range().start); - } - if key_range.contains(&range.end) { - points.push(l.get_key_range().end); - } + let version = match self.historic.get().unwrap().get_version(lsn.0) { + Some(v) => v, + None => return Ok(vec![]), + }; + + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + // Initialize loop variables + let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut current_key = start; + let mut current_val = version.image_coverage.query(start); + + // Loop through the change events and push intervals + for (change_key, change_val) in version.image_coverage.range(start..end) { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + coverage.push((kr, current_val.take())); + current_key = change_key; + current_val = change_val.clone(); } - points.push(key_range.end); - points.sort(); - points.dedup(); + // Add the final interval + let kr = Key::from_i128(current_key)..Key::from_i128(end); + coverage.push((kr, current_val.take())); - // Ok, we now have a list of "interesting" points in the key space - - // For each range between the points, find the latest image - let mut start = *points.first().unwrap(); - let mut ranges = Vec::new(); - for end in points[1..].iter() { - let img = self.find_latest_image(start, lsn); - - ranges.push((start..*end, img)); - - start = *end; - } - Ok(ranges) + Ok(coverage) } - /// Count the height of the tallest stack of deltas in this 2d region. + pub fn is_l0(layer: &L) -> bool { + range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) + } + + /// This function determines which layers are counted in `count_deltas`: + /// layers that should count towards deciding whether or not to reimage + /// a certain partition range. + /// + /// There are two kinds of layers we currently consider reimage-worthy: + /// + /// Case 1: Non-L0 layers are currently reimage-worthy by default. + /// TODO Some of these layers are very sparse and cover the entire key + /// range. Replacing 256MB of data (or less!) with terabytes of + /// images doesn't seem wise. 
We need a better heuristic, possibly + /// based on some of these factors: + /// a) whether this layer has any wal in this partition range + /// b) the size of the layer + /// c) the number of images needed to cover it + /// d) the estimated time until we'll have to reimage over it for GC + /// + /// Case 2: Since L0 layers by definition cover the entire key space, we consider + /// them reimage-worthy only when the entire key space can be covered by very few + /// images (currently 1). + /// TODO The optimal number should probably be slightly higher than 1, but to + /// implement that we need to plumb a lot more context into this function + /// than just the current partition_range. + pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + // Case 1 + if !Self::is_l0(layer) { + return true; + } + + // Case 2 + if range_eq(partition_range, &(Key::MIN..Key::MAX)) { + return true; + } + + false + } + + /// Count the height of the tallest stack of reimage-worthy deltas + /// in this 2d region. + /// + /// If `limit` is provided we don't try to count above that number. /// /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. - /// - /// TODO currently we just return the total number of deltas in the - /// region, no matter if they're stacked on top of each other - /// or next to each other. - pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { - let mut result = 0; - if lsn_range.start >= lsn_range.end { + pub fn count_deltas( + &self, + key: &Range, + lsn: &Range, + limit: Option, + ) -> Result { + // We get the delta coverage of the region, and for each part of the coverage + // we recurse right underneath the delta. The recursion depth is limited by + // the largest result this function could return, which is in practice between + // 3 and 10 (since we usually try to create an image when the number gets larger). + + if lsn.is_empty() || key.is_empty() || limit == Some(0) { return Ok(0); } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); - assert!(range_overlaps(&l.get_key_range(), key_range)); - // We ignore level0 delta layers. 
Unless the whole keyspace fits - // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) - && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) - { - continue; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(0), + }; + + let start = key.start.to_i128(); + let end = key.end.to_i128(); + + // Initialize loop variables + let mut max_stacked_deltas = 0; + let mut current_key = start; + let mut current_val = version.delta_coverage.query(start); + + // Loop through the delta coverage and recurse on each part + for (change_key, change_val) in version.delta_coverage.range(start..end) { + // If there's a relevant delta in this part, add 1 and recurse down + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + let lr = lsn.start..val.get_lsn_range().start; + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = + self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } } - result += 1; + current_key = change_key; + current_val = change_val.clone(); } - Ok(result) + + // Consider the last part + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(end); + let lr = lsn.start..val.get_lsn_range().start; + + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } + } + + Ok(max_stacked_deltas) + } + + /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. + /// + /// The `partition_range` argument is used as context for the reimage-worthiness decision. + /// + /// Used as a helper for correctness checks only. Performance not critical. + pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { + match self.search(key, lsn) { + Some(search_result) => { + if search_result.layer.is_incremental() { + (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) + + self.get_difficulty(search_result.lsn_floor, key, partition_range) + } else { + 0 + } + } + None => 0, + } + } + + /// Used for correctness checking. Results are expected to be identical to + /// self.get_difficulty_map. Assumes self.search is correct. + pub fn get_difficulty_map_bruteforce( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + ) -> Vec { + // Looking at the difficulty as a function of key, it could only increase + // when a delta layer starts or an image layer ends. 
Therefore it's sufficient + // to check the difficulties at: + // - the key.start for each non-empty part range + // - the key.start for each delta + // - the key.end for each image + let keys_iter: Box> = { + let mut keys: Vec = self + .iter_historic_layers() + .map(|layer| { + if layer.is_incremental() { + layer.get_key_range().start + } else { + layer.get_key_range().end + } + }) + .collect(); + keys.sort(); + Box::new(keys.into_iter()) + }; + let mut keys_iter = keys_iter.peekable(); + + // Iter the partition and keys together and query all the necessary + // keys, computing the max difficulty for each part. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + // Partition ranges are assumed to be sorted and disjoint + // TODO assert it + for range in &part.ranges { + if !range.is_empty() { + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); + } + while let Some(key) = keys_iter.peek() { + if key >= &range.end { + break; + } + let key = keys_iter.next().unwrap(); + if key < range.start { + continue; + } + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); + } + } + difficulty + }) + .collect() + } + + /// For each part of a keyspace partitioning, return the maximum number of layers + /// that would be needed for page reconstruction in that part at the given LSN. + /// + /// If `limit` is provided we don't try to count above that number. + /// + /// This method is used to decide where to create new image layers. Computing the + /// result for the entire partitioning at once allows this function to be more + /// efficient, and further optimization is possible by using iterators instead, + /// to allow early return. + /// + /// TODO actually use this method instead of count_deltas. Currently we only use + /// it for benchmarks. + pub fn get_difficulty_map( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + limit: Option, + ) -> Vec { + // TODO This is a naive implementation. Perf improvements to do: + // 1. Instead of calling self.image_coverage and self.count_deltas, + // iterate the image and delta coverage only once. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + for range in &part.ranges { + if limit == Some(difficulty) { + break; + } + for (img_range, last_img) in self + .image_coverage(range, lsn) + .expect("why would this err?") + { + if limit == Some(difficulty) { + break; + } + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + if img_lsn < lsn { + let num_deltas = self + .count_deltas(&img_range, &(img_lsn..lsn), limit) + .expect("why would this err lol?"); + difficulty = std::cmp::max(difficulty, num_deltas); + } + } + } + difficulty + }) + .collect() } /// Return all L0 delta layers @@ -629,8 +668,8 @@ where } println!("historic_layers:"); - for e in self.historic_layers.iter() { - e.layer.dump(verbose)?; + for layer in self.iter_historic_layers() { + layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs new file mode 100644 index 0000000000..46821aef15 --- /dev/null +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -0,0 +1,583 @@ +use std::collections::BTreeMap; +use std::ops::Range; + +use tracing::info; + +use super::layer_coverage::LayerCoverageTuple; + +/// Layers in this module are identified and indexed by this data. 
+/// +/// This is a helper struct to enable sorting layers by lsn.start. +/// +/// These three values are enough to uniquely identify a layer, since +/// a layer is obligated to contain all contents within range, so two +/// deltas (or images) with the same range have identical content. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct LayerKey { + // TODO I use i128 and u64 because it was easy for prototyping, + // testing, and benchmarking. If we can use the Lsn and Key + // types without overhead that would be preferable. + pub key: Range, + pub lsn: Range, + pub is_image: bool, +} + +impl PartialOrd for LayerKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for LayerKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // NOTE we really care about comparing by lsn.start first + self.lsn + .start + .cmp(&other.lsn.start) + .then(self.lsn.end.cmp(&other.lsn.end)) + .then(self.key.start.cmp(&other.key.start)) + .then(self.key.end.cmp(&other.key.end)) + .then(self.is_image.cmp(&other.is_image)) + } +} + +/// Efficiently queryable layer coverage for each LSN. +/// +/// Allows answering layer map queries very efficiently, +/// but doesn't allow retroactive insertion, which is +/// sometimes necessary. See BufferedHistoricLayerCoverage. +pub struct HistoricLayerCoverage { + /// The latest state + head: LayerCoverageTuple, + + /// All previous states + historic: BTreeMap>, +} + +impl Default for HistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl HistoricLayerCoverage { + pub fn new() -> Self { + Self { + head: LayerCoverageTuple::default(), + historic: BTreeMap::default(), + } + } + + /// Add a layer + /// + /// Panics if new layer has older lsn.start than an existing layer. + /// See BufferedHistoricLayerCoverage for a more general insertion method. + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + // It's only a persistent map, not a retroactive one + if let Some(last_entry) = self.historic.iter().next_back() { + let last_lsn = last_entry.0; + if layer_key.lsn.start < *last_lsn { + panic!("unexpected retroactive insert"); + } + } + + // Insert into data structure + if layer_key.is_image { + self.head + .image_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } else { + self.head + .delta_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } + + // Remember history. Clone is O(1) + self.historic.insert(layer_key.lsn.start, self.head.clone()); + } + + /// Query at a particular LSN, inclusive + pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple> { + match self.historic.range(..=lsn).next_back() { + Some((_, v)) => Some(v), + None => None, + } + } + + /// Remove all entries after a certain LSN (inclusive) + pub fn trim(&mut self, begin: &u64) { + self.historic.split_off(begin); + self.head = self + .historic + .iter() + .rev() + .next() + .map(|(_, v)| v.clone()) + .unwrap_or_default(); + } +} + +/// This is the most basic test that demonstrates intended usage. +/// All layers in this test have height 1. 
+#[test] +fn test_persistent_simple() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 5..6, + lsn: 120..121, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // After Layer 1 insertion + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // After Layer 2 insertion + let version = map.get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(11), None); + + // After Layer 3 insertion + let version = map.get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string())); +} + +/// Cover simple off-by-one edge cases +#[test] +fn test_off_by_one() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 1".to_string(), + ); + + // Check different LSNs + let version = map.get_version(99); + assert!(version.is_none()); + let version = map.get_version(100).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + let version = map.get_version(110).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // Check different keys + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(5), None); +} + +/// Cover edge cases where layers begin or end on the same key +#[test] +fn test_key_collision() { + let mut map = HistoricLayerCoverage::::new(); + + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 10".to_string(), + ); + map.insert( + LayerKey { + key: 5..8, + lsn: 100..110, + is_image: true, + }, + "Layer 11".to_string(), + ); + map.insert( + LayerKey { + key: 3..4, + lsn: 200..210, + is_image: true, + }, + "Layer 20".to_string(), + ); + + // Check after layer 11 + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 10".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); + + // Check after layer 20 + let version = map.get_version(205).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 20".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); +} + +/// Test when rectangles have nontrivial 
height and possibly overlap +#[test] +fn test_persistent_overlapping() { + let mut map = HistoricLayerCoverage::::new(); + + // Add 3 key-disjoint layers with varying LSN ranges + map.insert( + LayerKey { + key: 1..2, + lsn: 100..200, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 4..5, + lsn: 110..200, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 7..8, + lsn: 120..300, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // Add wide and short layer + map.insert( + LayerKey { + key: 0..9, + lsn: 130..199, + is_image: true, + }, + "Layer 4".to_string(), + ); + + // Add wide layer taller than some + map.insert( + LayerKey { + key: 0..9, + lsn: 140..201, + is_image: true, + }, + "Layer 5".to_string(), + ); + + // Add wide layer taller than all + map.insert( + LayerKey { + key: 0..9, + lsn: 150..301, + is_image: true, + }, + "Layer 6".to_string(), + ); + + // After layer 4 insertion + let version = map.get_version(135).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string())); + + // After layer 5 insertion + let version = map.get_version(145).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string())); + + // After layer 6 insertion + let version = map.get_version(155).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string())); +} + +/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack +/// of support for retroactive insertion by rebuilding the map since the +/// change. +/// +/// Why is this needed? We most often insert new layers with newer LSNs, +/// but during compaction we create layers with non-latest LSN, and during +/// GC we delete historic layers. +/// +/// Even though rebuilding is an expensive (N log N) solution to the problem, +/// it's not critical since we do something equally expensive just to decide +/// whether or not to create new image layers. +/// TODO It's not expensive but it's not great to hold a layer map write lock +/// for that long. 
+/// +/// If this becomes an actual bottleneck, one solution would be to build a +/// segment tree that holds PersistentLayerMaps. Though this would mean that +/// we take an additional log(N) performance hit for queries, which will probably +/// still be more critical. +/// +/// See this for more on persistent and retroactive techniques: +/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +pub struct BufferedHistoricLayerCoverage { + /// A persistent layer map that we rebuild when we need to retroactively update + historic_coverage: HistoricLayerCoverage, + + /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. + buffer: BTreeMap>, + + /// All current layers. This is not used for search. Only to make rebuilds easier. + layers: BTreeMap, +} + +impl std::fmt::Debug for BufferedHistoricLayerCoverage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RetroactiveLayerMap") + .field("buffer", &self.buffer) + .field("layers", &self.layers) + .finish() + } +} + +impl Default for BufferedHistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl BufferedHistoricLayerCoverage { + pub fn new() -> Self { + Self { + historic_coverage: HistoricLayerCoverage::::new(), + buffer: BTreeMap::new(), + layers: BTreeMap::new(), + } + } + + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + self.buffer.insert(layer_key, Some(value)); + } + + pub fn remove(&mut self, layer_key: LayerKey) { + self.buffer.insert(layer_key, None); + } + + pub fn rebuild(&mut self) { + // Find the first LSN that needs to be rebuilt + let rebuild_since: u64 = match self.buffer.iter().next() { + Some((LayerKey { lsn, .. }, _)) => lsn.start, + None => return, // No need to rebuild if buffer is empty + }; + + // Apply buffered updates to self.layers + let num_updates = self.buffer.len(); + self.buffer.retain(|layer_key, layer| { + match layer { + Some(l) => { + self.layers.insert(layer_key.clone(), l.clone()); + } + None => { + self.layers.remove(layer_key); + } + }; + false + }); + + // Rebuild + let mut num_inserted = 0; + self.historic_coverage.trim(&rebuild_since); + for (layer_key, layer) in self.layers.range( + LayerKey { + lsn: rebuild_since..0, + key: 0..0, + is_image: false, + }.., + ) { + self.historic_coverage + .insert(layer_key.clone(), layer.clone()); + num_inserted += 1; + } + + // TODO maybe only warn if ratio is at least 10 + info!( + "Rebuilt layer map. Did {} insertions to process a batch of {} updates.", + num_inserted, num_updates, + ) + } + + /// Iterate all the layers + pub fn iter(&self) -> impl '_ + Iterator { + // NOTE we can actually perform this without rebuilding, + // but it's not necessary for now. + if !self.buffer.is_empty() { + panic!("rebuild pls") + } + + self.layers.values().cloned() + } + + /// Return a reference to a queryable map, assuming all updates + /// have already been processed using self.rebuild() + pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { + // NOTE we error here instead of implicitly rebuilding because + // rebuilding is somewhat expensive. + // TODO maybe implicitly rebuild and log/sentry an error? 
+ if !self.buffer.is_empty() { + anyhow::bail!("rebuild required") + } + + Ok(&self.historic_coverage) + } +} + +#[test] +fn test_retroactive_regression_1() { + let mut map = BufferedHistoricLayerCoverage::new(); + + map.insert( + LayerKey { + key: 0..21267647932558653966460912964485513215, + lsn: 23761336..23761457, + is_image: false, + }, + "sdfsdfs".to_string(), + ); + + map.rebuild(); + + let version = map.get().unwrap().get_version(23761457).unwrap(); + assert_eq!( + version.delta_coverage.query(100), + Some("sdfsdfs".to_string()) + ); +} + +#[test] +fn test_retroactive_simple() { + let mut map = BufferedHistoricLayerCoverage::new(); + + // Append some images in increasing LSN order + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Image 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Image 2".to_string(), + ); + map.insert( + LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }, + "Image 3".to_string(), + ); + map.insert( + LayerKey { + key: 8..9, + lsn: 120..121, + is_image: true, + }, + "Image 4".to_string(), + ); + + // Add a delta layer out of order + map.insert( + LayerKey { + key: 2..5, + lsn: 105..106, + is_image: true, + }, + "Delta 1".to_string(), + ); + + // Rebuild so we can start querying + map.rebuild(); + + // Query key 4 + let version = map.get().unwrap().get_version(90); + assert!(version.is_none()); + let version = map.get().unwrap().get_version(102).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); + let version = map.get().unwrap().get_version(107).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Delta 1".to_string())); + let version = map.get().unwrap().get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string())); + + // Remove Image 3 + map.remove(LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }); + map.rebuild(); + + // Check deletion worked + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs new file mode 100644 index 0000000000..4e3b4516dc --- /dev/null +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -0,0 +1,154 @@ +use std::ops::Range; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. It also runs a bit faster but +// results are not the same on some tests. +use rpds::RedBlackTreeMapSync; + +/// Data structure that can efficiently: +/// - find the latest layer by lsn.end at a given key +/// - iterate the latest layers in a key range +/// - insert layers in non-decreasing lsn.start order +/// +/// The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. +pub struct LayerCoverage { + /// For every change in coverage (as we sweep the key space) + /// we store (lsn.end, value). + /// + /// We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. 
+ /// + /// We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. + nodes: RedBlackTreeMapSync>, +} + +impl Default for LayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl LayerCoverage { + pub fn new() -> Self { + Self { + nodes: RedBlackTreeMapSync::default(), + } + } + + /// Helper function to subdivide the key range without changing any values + /// + /// Complexity: O(log N) + fn add_node(&mut self, key: i128) { + let value = match self.nodes.range(..=key).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.nodes.insert_mut(key, value); + } + + /// Insert a layer. + /// + /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation. + pub fn insert(&mut self, key: Range, lsn: Range, value: Value) { + // Add nodes at endpoints + // + // NOTE The order of lines is important. We add nodes at the start + // and end of the key range **before updating any nodes** in order + // to pin down the current coverage outside of the relevant key range. + // Only the coverage inside the layer's key range should change. + self.add_node(key.start); + self.add_node(key.end); + + // Raise the height where necessary + // + // NOTE This loop is worst case O(N), but amortized O(log N) in the special + // case when rectangles have no height. In practice I don't think we'll see + // the kind of layer intersections needed to trigger O(N) behavior. The worst + // case is N/2 horizontal layers overlapped with N/2 vertical layers in a + // grid pattern. + let mut to_update = Vec::new(); + let mut to_remove = Vec::new(); + let mut prev_covered = false; + for (k, node) in self.nodes.range(key.clone()) { + let needs_cover = match node { + None => true, + Some((h, _)) => h < &lsn.end, + }; + if needs_cover { + match prev_covered { + true => to_remove.push(*k), + false => to_update.push(*k), + } + } + prev_covered = needs_cover; + } + if !prev_covered { + to_remove.push(key.end); + } + for k in to_update { + self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); + } + for k in to_remove { + self.nodes.remove_mut(&k); + } + } + + /// Get the latest (by lsn.end) layer at a given key + /// + /// Complexity: O(log N) + pub fn query(&self, key: i128) -> Option { + self.nodes + .range(..=key) + .rev() + .next()? + .1 + .as_ref() + .map(|(_, v)| v.clone()) + } + + /// Iterate the changes in layer coverage in a given range. You will likely + /// want to start with self.query(key.start), and then follow up with self.range + /// + /// Complexity: O(log N + result_size) + pub fn range(&self, key: Range) -> impl '_ + Iterator)> { + self.nodes + .range(key) + .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) + } + + /// O(1) clone + pub fn clone(&self) -> Self { + Self { + nodes: self.nodes.clone(), + } + } +} + +/// Image and delta coverage at a specific LSN. 
+pub struct LayerCoverageTuple { + pub image_coverage: LayerCoverage, + pub delta_coverage: LayerCoverage, +} + +impl Default for LayerCoverageTuple { + fn default() -> Self { + Self { + image_coverage: LayerCoverage::default(), + delta_coverage: LayerCoverage::default(), + } + } +} + +impl LayerCoverageTuple { + pub fn clone(&self) -> Self { + Self { + image_coverage: self.image_coverage.clone(), + delta_coverage: self.delta_coverage.clone(), + } + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dce7cd8bae..a9edee3794 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,6 +16,7 @@ use remote_storage::GenericRemoteStorage; use utils::crashsafe; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; @@ -24,8 +25,35 @@ use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); +/// The tenants known to the pageserver. +/// The enum variants are used to distinguish the different states that the pageserver can be in. +enum TenantsMap { + /// [`init_tenant_mgr`] is not done yet. + Initializing, + /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. + /// New tenants can be added using [`tenant_map_insert`]. + Open(HashMap>), + /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// Existing tenants are still accessible, but no new tenants can be created. + ShuttingDown(HashMap>), +} + +impl TenantsMap { + fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), + } + } + fn remove(&mut self, tenant_id: &TenantId) -> Option> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), + } + } +} + +static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. 
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) @@ -36,13 +64,16 @@ pub async fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants - let mut number_of_tenants = 0; let tenants_dir = conf.tenants_path(); + let mut tenants = HashMap::new(); + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + loop { match dir_entries.next_entry().await { Ok(None) => break, @@ -86,10 +117,10 @@ pub async fn init_tenant_mgr( conf, &tenant_dir_path, remote_storage.clone(), + &ctx, ) { Ok(tenant) => { - TENANTS.write().await.insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; + tenants.insert(tenant.tenant_id(), tenant); } Err(e) => { error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); @@ -108,7 +139,11 @@ pub async fn init_tenant_mgr( } } - info!("Processed {number_of_tenants} local tenants at startup"); + info!("Processed {} local tenants at startup", tenants.len()); + + let mut tenants_map = TENANTS.write().await; + assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); + *tenants_map = TenantsMap::Open(tenants); Ok(()) } @@ -116,6 +151,7 @@ pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, remote_storage: Option, + ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( tenant_path.is_dir(), @@ -150,7 +186,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage) + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant(conf, tenant_id) @@ -158,7 +194,7 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage) + Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) }; Ok(tenant) } @@ -166,21 +202,44 @@ pub fn schedule_local_tenant_processing( /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// +/// NB: We leave the tenants in the map, so that they remain accessible through +/// the management API until we shut it down. If we removed the shut-down tenants +/// from the tenants map, the management API would return 404 for these tenants, +/// because TenantsMap::get() now returns `None`. +/// That could be easily misinterpreted by control plane, the consumer of the +/// management API. For example, it could attach the tenant on a different pageserver. +/// We would then be in split-brain once this pageserver restarts. pub async fn shutdown_all_tenants() { + // Prevent new tenants from being created. 
let tenants_to_shut_down = { let mut m = TENANTS.write().await; - let mut tenants_to_shut_down = Vec::with_capacity(m.len()); - for (_, tenant) in m.drain() { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_shut_down.push(tenant) + match &mut *m { + TenantsMap::Initializing => { + *m = TenantsMap::ShuttingDown(HashMap::default()); + info!("tenants map is empty"); + return; + } + TenantsMap::Open(tenants) => { + let tenants_clone = tenants.clone(); + *m = TenantsMap::ShuttingDown(std::mem::take(tenants)); + tenants_clone + } + TenantsMap::ShuttingDown(_) => { + error!("already shutting down, this function isn't supposed to be called more than once"); + return; } } - drop(m); - tenants_to_shut_down }; + let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); + for (_, tenant) in tenants_to_shut_down { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_stopping(); + tenants_to_freeze_and_flush.push(tenant); + } + } + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; @@ -192,7 +251,7 @@ pub async fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_shut_down { + for tenant in tenants_to_freeze_and_flush { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); @@ -207,27 +266,23 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result>> { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - debug!("tenant {tenant_id} already exists"); - Ok(None) - } - hash_map::Entry::Vacant(v) => { - // Hold the write_tenants() lock, since all of this is local IO. - // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; - let crated_tenant_id = created_tenant.tenant_id(); - anyhow::ensure!( + ctx: &RequestContext, +) -> Result, TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { + // We're holding the tenants lock in write mode while doing local IO. + // If this section ever becomes contentious, introduce a new `TenantState::Creating` + // and do the work in that state. + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let created_tenant = + schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + let crated_tenant_id = created_tenant.tenant_id(); + anyhow::ensure!( tenant_id == crated_tenant_id, "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", ); - v.insert(Arc::clone(&created_tenant)); - Ok(Some(created_tenant)) - } - } + vacant_entry.insert(Arc::clone(&created_tenant)); + Ok(created_tenant) + }).await } pub async fn update_tenant_config( @@ -236,10 +291,11 @@ pub async fn update_tenant_config( tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true) - .await? 
- .update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; + let tenant = get_tenant(tenant_id, true).await?; + + tenant.update_tenant_config(tenant_conf); + let tenant_config_path = conf.tenant_config_path(tenant_id); + Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?; Ok(()) } @@ -260,10 +316,14 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } } -pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { +pub async fn delete_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ctx: &RequestContext, +) -> anyhow::Result<()> { match get_tenant(tenant_id, true).await { Ok(tenant) => { - tenant.delete_timeline(timeline_id).await?; + tenant.delete_timeline(timeline_id, ctx).await?; } Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } @@ -291,8 +351,9 @@ pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); if tenant_ignore_mark.exists() { @@ -300,7 +361,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -329,16 +390,24 @@ pub async fn ignore_tenant( .await } +#[derive(Debug, thiserror::Error)] +pub enum TenantMapListError { + #[error("tenant map is still initiailizing")] + Initializing, +} + /// /// Get list of tenants, for the mgmt API /// -pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { - TENANTS - .read() - .await - .iter() +pub async fn list_tenants() -> Result, TenantMapListError> { + let tenants = TENANTS.read().await; + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() .map(|(id, tenant)| (*id, tenant.current_state())) - .collect() + .collect()) } /// Execute Attach mgmt API command. 
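// An illustrative, std-only sketch of the tenant_map_insert pattern introduced in
// the hunk below: look up the entry for an id, refuse with a typed error if it is
// occupied, and otherwise let a caller-supplied closure build the value. The real
// function additionally distinguishes the Initializing/Open/ShuttingDown map states
// and hands the vacant entry to the closure; this simplified stand-in has the
// closure return the value instead.
use std::collections::hash_map::{Entry, HashMap};

#[derive(Debug)]
enum InsertError {
    AlreadyExists(u64),
    Closure(String),
}

fn map_insert<V, F>(map: &mut HashMap<u64, V>, id: u64, insert_fn: F) -> Result<(), InsertError>
where
    F: FnOnce() -> Result<V, String>,
{
    match map.entry(id) {
        Entry::Occupied(_) => Err(InsertError::AlreadyExists(id)),
        Entry::Vacant(v) => {
            v.insert(insert_fn().map_err(InsertError::Closure)?);
            Ok(())
        }
    }
}

fn main() {
    let mut map: HashMap<u64, String> = HashMap::new();
    map_insert(&mut map, 1, || Ok("first tenant".to_string())).unwrap();
    let err = map_insert(&mut map, 1, || Ok("duplicate".to_string()));
    assert!(matches!(err, Err(InsertError::AlreadyExists(1))));
}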
@@ -349,34 +418,62 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); anyhow::ensure!( !tenant_path.exists(), "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); + let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); vacant_entry.insert(tenant); - Ok(()) }) .await } -async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result +#[derive(Debug, thiserror::Error)] +pub enum TenantMapInsertError { + #[error("tenant map is still initializing")] + StillInitializing, + #[error("tenant map is shutting down")] + ShuttingDown, + #[error("tenant {0} already exists, state: {1:?}")] + TenantAlreadyExists(TenantId, TenantState), + #[error(transparent)] + Closure(#[from] anyhow::Error), +} + +/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that +/// entry is vacant. The closure is responsible for creating the tenant object and inserting +/// it into the tenants map through the vacnt entry that it receives as argument. +/// +/// NB: the closure should return quickly because the current implementation of tenants map +/// serializes access through an `RwLock`. +async fn tenant_map_insert( + tenant_id: TenantId, + insert_fn: F, +) -> Result where F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result, { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(e) => { - anyhow::bail!( - "tenant {tenant_id} already exists, state: {:?}", - e.get().current_state() - ) - } - hash_map::Entry::Vacant(v) => run(v), + let mut guard = TENANTS.write().await; + let m = match &mut *guard { + TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing), + TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown), + TenantsMap::Open(m) => m, + }; + match m.entry(tenant_id) { + hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists( + tenant_id, + e.get().current_state(), + )), + hash_map::Entry::Vacant(v) => match insert_fn(v) { + Ok(v) => Ok(v), + Err(e) => Err(TenantMapInsertError::Closure(e)), + }, } } @@ -449,9 +546,9 @@ pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; - let tenant = guard .get(&tenant_id) .map(Arc::clone) @@ -462,7 +559,8 @@ pub async fn immediate_gc( // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -474,7 +572,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr) + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error 
for multiple reasons; we should handle it @@ -497,6 +595,7 @@ pub async fn immediate_gc( pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; @@ -510,7 +609,8 @@ pub async fn immediate_compact( .get_timeline(timeline_id, true) .map_err(ApiError::NotFound)?; - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -523,7 +623,7 @@ pub async fn immediate_compact( false, async move { let result = timeline - .compact() + .compact(&ctx) .instrument( info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..3f69017160 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1010,7 +1010,10 @@ impl RemoteTimelineClient { #[cfg(test)] mod tests { use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::{ + tenant::harness::{TenantHarness, TIMELINE_ID}, + DEFAULT_PG_VERSION, + }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; @@ -1064,9 +1067,19 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let _entered = runtime.enter(); + let harness = TenantHarness::create("upload_scheduling")?; + let (tenant, ctx) = runtime.block_on(harness.load()); + let _timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let timeline_path = harness.timeline_path(&TIMELINE_ID); - std::fs::create_dir_all(&timeline_path)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; @@ -1084,14 +1097,6 @@ mod tests { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 61cb32fc76..2fed4f88b3 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,6 +6,7 @@ use anyhow::Context; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; @@ -181,6 +182,7 @@ pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, ) -> anyhow::Result { // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to // our advantage with `?` error handling. 
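// An illustrative, self-contained sketch of the pattern used by immediate_gc and
// immediate_compact above: spawn the long-running work as a background task and
// hand the HTTP handler a oneshot receiver to await, so no lock on the tenants map
// is held while the work runs. Plain tokio::spawn stands in for task_mgr::spawn,
// and the tokio/anyhow dependencies are assumed from the surrounding code.
use tokio::sync::oneshot;

fn spawn_background_work() -> oneshot::Receiver<anyhow::Result<u64>> {
    let (task_done, wait_task_done) = oneshot::channel();
    tokio::spawn(async move {
        // ... the GC / compaction work would run here ...
        let result: anyhow::Result<u64> = Ok(42);
        // The receiver may have been dropped if the caller gave up waiting.
        let _ = task_done.send(result);
    });
    wait_task_done
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let layers_removed = spawn_background_work().await??;
    assert_eq!(layers_removed, 42);
    Ok(())
}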
@@ -188,7 +190,7 @@ pub(super) async fn gather_inputs( // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant - .refresh_gc_info() + .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; @@ -329,7 +331,13 @@ pub(super) async fn gather_inputs( } else { let timeline = Arc::clone(&timeline); let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + *lsn, + ctx, + )); } } @@ -387,6 +395,7 @@ pub(super) async fn gather_inputs( parallel_size_calcs, timeline.clone(), lsn, + ctx.attached_child(), )); if let Some(parent_id) = timeline.get_ancestor_timeline_id() { @@ -582,13 +591,14 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + ctx: RequestContext, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn) + .spawn_ondemand_logical_size_calculation(lsn, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6aee8ce23c..2149fc7eb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -196,3 +196,50 @@ pub fn downcast_remote_layer( None } } + +impl std::fmt::Debug for dyn Layer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layer") + .field("short_id", &self.short_id()) + .finish() + } +} + +/// Holds metadata about a layer without any content. Used mostly for testing. +pub struct LayerDescriptor { + pub key: Range, + pub lsn: Range, + pub is_incremental: bool, + pub short_id: String, +} + +impl Layer for LayerDescriptor { + fn get_key_range(&self) -> Range { + self.key.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn.clone() + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } + + fn short_id(&self) -> String { + self.short_id.clone() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b7ad8fe791..b126545ee4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -5,6 +5,7 @@ use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; +use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; @@ -52,19 +53,20 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); loop { trace!("waking up"); let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; let mut sleep_duration = tenant.get_compaction_period(); if sleep_duration == Duration::ZERO { @@ -73,7 +75,7 @@ async fn compaction_loop(tenant_id: TenantId) { sleep_duration = Duration::from_secs(10); } else { // Run compaction - if let Err(e) = tenant.compaction_iteration().await { + if let Err(e) = tenant.compaction_iteration(&ctx).await { sleep_duration = wait_duration; error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); } @@ -103,6 +105,9 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); loop { trace!("waking up"); @@ -127,7 +132,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d59858f582..0ca8a0e491 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,7 @@ //! +mod walreceiver; + use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; @@ -13,6 +15,7 @@ use pageserver_api::models::{ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; use std::collections::HashMap; @@ -23,6 +26,8 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::broker_client::is_broker_client_initialized; +use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, @@ -58,11 +63,11 @@ use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use walreceiver::spawn_connection_manager_task; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -128,7 +133,6 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - // Metrics metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between @@ -377,6 +381,12 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + /// The operation would require downloading a layer that is missing locally. 
+ NeedsDownload(TenantTimelineId, LayerFileName), + + /// The operation was cancelled + Cancelled, + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -386,6 +396,33 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +impl std::fmt::Display for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), Self::WalRedo(err) => err.fmt(f), } } @@ -422,11 +459,24 @@ impl Timeline { /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// - pub async fn get(&self, key: Key, lsn: Lsn) -> Result { + pub async fn get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -450,7 +500,7 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state) + self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; self.metrics @@ -513,13 +563,25 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + pub async fn wait_lsn( + &self, + lsn: Lsn, + _ctx: &RequestContext, /* Prepare for use by cancellation */ + ) -> anyhow::Result<()> { anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
anyhow::ensure!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -558,7 +620,7 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } - pub async fn compact(&self) -> anyhow::Result<()> { + pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -616,14 +678,16 @@ impl Timeline { .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), + ctx, ) .await { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = - self.create_image_layers(&partitioning, lsn, false).await?; + let layer_paths_to_upload = self + .create_image_layers(&partitioning, lsn, false, ctx) + .await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -673,7 +737,10 @@ impl Timeline { /// the initial size calculation has not been run (gets triggered on the first size access). /// /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { + pub fn get_current_logical_size( + self: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); @@ -683,7 +750,7 @@ impl Timeline { (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn); + self.try_spawn_size_init_task(init_lsn, ctx); } Ok((size, is_exact)) @@ -729,16 +796,24 @@ impl Timeline { Ok(()) } + pub fn activate(self: &Arc) { + self.set_state(TimelineState::Active); + self.launch_wal_receiver(); + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (st, TimelineState::Loading) => { + error!("ignoring transition from {st:?} into Loading state"); } (TimelineState::Broken, _) => { error!("Ignoring state update {new_state:?} for broken tenant"); } (TimelineState::Stopping, TimelineState::Active) => { - debug!("Not activating a Stopping timeline"); + error!("Not activating a Stopping timeline"); } (_, new_state) => { self.state.send_replace(new_state); @@ -812,7 +887,7 @@ impl Timeline { pg_version: u32, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Suspended); + let (state, _) = watch::channel(TimelineState::Loading); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -884,6 +959,10 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result 
+ .metrics + .last_record_gauge + .set(disk_consistent_lsn.0 as i64); + result }) } @@ -909,22 +988,25 @@ impl Timeline { let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); let self_clone = Arc::clone(self); + info!("spawning flush loop"); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { - self_clone.flush_loop(layer_flush_start_rx).await; - let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert_eq!(*flush_loop_state, FlushLoopState::Running); - *flush_loop_state = FlushLoopState::Exited; - Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) - ); + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; + let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); + assert_eq!(*flush_loop_state, FlushLoopState::Running); + *flush_loop_state = FlushLoopState::Exited; + Ok(()) + } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); *flush_loop_state = FlushLoopState::Running; } @@ -955,12 +1037,16 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); + let background_ctx = + // XXX: this is a detached_child. Plumb through the ctx from call sites. + RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); spawn_connection_manager_task( self_clone, walreceiver_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + background_ctx, ); } @@ -970,6 +1056,7 @@ impl Timeline { /// pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut num_layers = 0; let timer = self.metrics.load_layer_map_histo.start_timer(); @@ -1010,7 +1097,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1041,7 +1128,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1067,6 +1154,7 @@ impl Timeline { } } + updates.flush(); layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( @@ -1091,6 +1179,11 @@ impl Timeline { // Are we missing some files that are present in remote storage? // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; + + // We're holding a layer map lock for a while but this + // method is only called during init so it's fine. 
+ let mut layer_map = self.layers.write().unwrap(); + let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1129,7 +1222,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - self.layers.write().unwrap().remove_historic(local_layer); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1171,7 +1264,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1194,13 +1287,14 @@ impl Timeline { &remote_layer_metadata, ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), } } + updates.flush(); Ok(local_only_layers) } @@ -1280,7 +1374,7 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1296,8 +1390,18 @@ impl Timeline { .initial_logical_size .get() .is_none()); + + info!( + "spawning logical size computation from context of task kind {:?}", + ctx.task_kind() + ); // We need to start the computation task. + // It gets a separate context since it will outlive the request that called this function. let self_clone = Arc::clone(self); + let background_ctx = ctx.detached_child( + TaskKind::InitialLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, @@ -1307,7 +1411,9 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { - let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + let calculated_size = match self_clone + .logical_size_calculation_task(init_lsn, &background_ctx) + .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { @@ -1342,18 +1448,27 @@ impl Timeline { pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + ctx: RequestContext, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); + // XXX if our caller loses interest, i.e., ctx is cancelled, + // we should stop the size calculation work and return an error. + // That would require restructuring this function's API to + // return the result directly, instead of a Receiver for the result. 
+ let ctx = ctx.detached_child( + TaskKind::OndemandLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, + task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn).await; + let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors }, @@ -1365,6 +1480,7 @@ impl Timeline { async fn logical_size_calculation_task( self: &Arc, init_lsn: Lsn, + ctx: &RequestContext, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -1372,12 +1488,13 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); + let ctx = ctx.attached_child(); tokio::task::spawn_blocking(move || { // Run in a separate thread since this can do a lot of // synchronous file IO without .await inbetween // if there are no RemoteLayers that would require downloading. let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) }) .await .context("Failed to spawn calculation result task")? @@ -1392,7 +1509,7 @@ impl Timeline { TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping - | TimelineState::Suspended => { + | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") } } @@ -1432,10 +1549,11 @@ impl Timeline { /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - async fn calculate_logical_size( + pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", @@ -1478,7 +1596,7 @@ impl Timeline { self.metrics.logical_size_histo.start_timer() }; let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -1555,6 +1673,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { // Start from the current timeline. let mut timeline_owned; @@ -1742,14 +1861,43 @@ impl Timeline { let remote_layer_as_persistent: Arc = Arc::clone(&remote_layer) as Arc; let id = remote_layer_as_persistent.traversal_id(); - info!("need remote layer {id}"); + info!( + "need remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); // The next layer doesn't exist locally. Need to download it. // (The control flow is a bit complicated here because we must drop the 'layers' // lock before awaiting on the Future.) 
- info!("on-demand downloading remote layer {id}"); - timeline.download_remote_layer(remote_layer).await?; - continue 'layer_map_search; + match ( + ctx.download_behavior(), + self.conf.ondemand_download_behavior_treat_error_as_warn, + ) { + (DownloadBehavior::Download, _) => { + info!( + "on-demand downloading remote layer {id} for task kind {:?}", + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Warn, _) | (DownloadBehavior::Error, true) => { + warn!( + "unexpectedly on-demand downloading remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Error, false) => { + return Err(PageReconstructError::NeedsDownload( + TenantTimelineId::new(self.tenant_id, self.timeline_id), + remote_layer.file_name.clone(), + )) + } + } } } } @@ -1871,7 +2019,11 @@ impl Timeline { } /// Layer flusher task's main loop. - async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + async fn flush_loop( + &self, + mut layer_flush_start_rx: tokio::sync::watch::Receiver, + ctx: &RequestContext, + ) { info!("started flush loop"); loop { tokio::select! { @@ -1892,7 +2044,7 @@ impl Timeline { // drop 'layers' lock to allow concurrent reads and writes }; if let Some(layer_to_flush) = layer_to_flush { - if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { error!("could not flush frozen layer: {err:?}"); break Err(err); } @@ -1957,8 +2109,12 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] - async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + async fn flush_frozen_layer( + &self, + frozen_layer: Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -1966,10 +2122,12 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. let (partitioning, _lsn) = self - .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; - self.create_image_layers(&partitioning, self.initdb_lsn, true) + self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) .await? } else { // normal case, write out a L0 delta layer file. 
@@ -2099,10 +2257,11 @@ impl Timeline { ])?; // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } + self.layers + .write() + .unwrap() + .batch_update() + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2119,6 +2278,7 @@ impl Timeline { &self, lsn: Lsn, partition_size: u64, + ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); @@ -2129,7 +2289,7 @@ impl Timeline { return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } - let keyspace = self.collect_keyspace(lsn).await?; + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); let mut partitioning_guard = self.partitioning.lock().unwrap(); @@ -2166,13 +2326,15 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + let threshold = self.get_image_creation_threshold(); + let num_deltas = + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - if num_deltas >= self.get_image_creation_threshold() { + if num_deltas >= threshold { return Ok(true); } } @@ -2187,6 +2349,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, + ctx: &RequestContext, ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); @@ -2211,7 +2374,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn).await { + let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2267,21 +2430,23 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { let path = l.filename(); let metadata = timeline_path .join(path.file_name()) .metadata() - .context("reading metadata of layer file {path}")?; + .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); self.metrics .resident_physical_size_gauge .add(metadata.len()); - layers.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l)); } + updates.flush(); drop(layers); timer.stop_and_record(); @@ -2577,6 +2742,7 @@ impl Timeline { } let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { let new_delta_path = l.path(); @@ -2597,7 +2763,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - layers.insert_historic(x); + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -2611,8 +2777,9 @@ impl Timeline { } 
layer_names_to_delete.push(l.filename()); l.delete()?; - layers.remove_historic(l); + updates.remove_historic(l); } + updates.flush(); drop(layers); // Also schedule the deletions in remote storage @@ -2662,6 +2829,7 @@ impl Timeline { retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // @@ -2674,7 +2842,7 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp).await? { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { // The timestamp is in the future. That sounds impossible, @@ -2725,6 +2893,8 @@ impl Timeline { /// obsolete. /// pub(super) async fn gc(&self) -> anyhow::Result { + let timer = self.metrics.garbage_collect_histo.start_timer(); + fail_point!("before-timeline-gc"); let _layer_removal_cs = self.layer_removal_cs.lock().await; @@ -2745,11 +2915,17 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - self.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + let res = self + .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), ) - .await + .await?; + + // only record successes + timer.stop_and_record(); + + Ok(res) } async fn gc_timeline( @@ -2812,6 +2988,7 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. newer on-disk image layers cover the layer's whole key range // + // TODO holding a write lock is too agressive and avoidable let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -2843,6 +3020,8 @@ impl Timeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. + // + // TODO Vec is not a great choice for `retain_lsns` for retain_lsn in &retain_lsns { // start_lsn is inclusive if &l.get_lsn_range().start <= retain_lsn { @@ -2896,6 +3075,7 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } + let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. @@ -2913,7 +3093,13 @@ impl Timeline { } layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? - layers.remove_historic(doomed_layer); + + // TODO Removing from the bottom of the layer map is expensive. + // Maybe instead discard all layer map historic versions that + // won't be needed for page reconstruction for this timeline, + // and mark what we can't delete yet as deleted from the layer + // map index without actually rebuilding the index. + updates.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2925,6 +3111,7 @@ impl Timeline { remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } } + updates.flush(); info!( "GC completed removing {} layers, cutoff {}", @@ -3081,11 +3268,13 @@ impl Timeline { // Delta- or ImageLayer in the layer map. 
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); + let mut updates = layers.batch_update(); { let l: Arc = remote_layer.clone(); - layers.remove_historic(l); + updates.remove_historic(l); } - layers.insert_historic(new_layer); + updates.insert_historic(new_layer); + updates.flush(); drop(layers); // Now that we've inserted the download into the layer map, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs similarity index 83% rename from pageserver/src/walreceiver.rs rename to pageserver/src/tenant/timeline/walreceiver.rs index fc9daadc5c..f33a12c5cc 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,58 +23,15 @@ mod connection_manager; mod walreceiver_connection; -use crate::config::PageServerConf; use crate::task_mgr::WALRECEIVER_RUNTIME; -use anyhow::Context; -use once_cell::sync::OnceCell; use std::future::Future; -use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; pub use connection_manager::spawn_connection_manager_task; -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} - /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] /// and a cancellation token that it can listen to for earlier interrupts. 
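// A self-contained sketch of the task-handle pattern described in the doc comment above:
// the spawned task publishes lifecycle updates over a watch channel and polls a
// cancellation token so it can be interrupted early. All names below are illustrative
// assumptions; this is not the real TaskHandle / TaskEvent machinery.
use std::time::Duration;
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;

async fn task_handle_sketch() {
    let (events_tx, mut events_rx) = watch::channel("started");
    let cancel = CancellationToken::new();
    let task_cancel = cancel.clone();

    let join = tokio::spawn(async move {
        loop {
            tokio::select! {
                // Earlier interrupt requested by the owner of the handle.
                _ = task_cancel.cancelled() => break,
                _ = tokio::time::sleep(Duration::from_millis(10)) => {
                    // Publish progress; watch receivers only ever see the latest value.
                    let _ = events_tx.send("progress");
                }
            }
        }
    });

    // The owner can observe lifecycle updates ...
    let _ = events_rx.changed().await;
    // ... and ask the task to stop early.
    cancel.cancel();
    let _ = join.await;
}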
@@ -95,7 +52,6 @@ pub enum TaskEvent { #[derive(Debug, Clone)] pub enum TaskStateUpdate { - Init, Started, Progress(E), } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs similarity index 96% rename from pageserver/src/walreceiver/connection_manager.rs rename to pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 8b60e59305..cd7c7c51d2 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,10 +11,12 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use crate::task_mgr::TaskKind; +use super::TaskStateUpdate; +use crate::broker_client::get_broker_client; +use crate::context::RequestContext; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::Timeline; -use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -27,10 +29,7 @@ use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; -use crate::{ - exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, -}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::{ id::{NodeId, TenantTimelineId}, @@ -46,6 +45,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + ctx: RequestContext, ) { let mut broker_client = get_broker_client().clone(); @@ -78,6 +78,7 @@ pub fn spawn_connection_manager_task( loop_step_result = connection_manager_loop_step( &mut broker_client, &mut walreceiver_state, + &ctx, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { @@ -101,6 +102,7 @@ pub fn spawn_connection_manager_task( async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, + ctx: &RequestContext, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -145,7 +147,7 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { - TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Started) => {}, TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { if new_status.has_processed_wal { // We have advanced last_record_lsn by processing the WAL received @@ -183,13 +185,23 @@ async fn connection_manager_loop_step( new_event = async { loop { + if walreceiver_state.timeline.current_state() == TimelineState::Loading { + warn!("wal connection manager should only be launched after timeline has become active"); + } match timeline_state_updates.changed().await { Ok(()) => { let new_state = walreceiver_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state), + TimelineState::Broken | TimelineState::Stopping => { + info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + TimelineState::Loading => { + warn!("timeline transitioned back to Loading state, that should not happen"); + return ControlFlow::Continue(new_state); + } } } Err(_sender_dropped_error) => return ControlFlow::Break(()), @@ -197,7 +209,7 @@ async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(new_state) => { - info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + info!("observed timeline state change, new state is {new_state:?}"); return ControlFlow::Continue(()); } ControlFlow::Break(()) => { @@ -226,6 +238,7 @@ async fn connection_manager_loop_step( .change_connection( new_candidate.safekeeper_id, new_candidate.wal_source_connconf, + ctx, ) .await } @@ -289,7 +302,9 @@ async fn subscribe_for_timeline_updates( return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and + // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
+ info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -389,12 +404,17 @@ impl WalreceiverState { &mut self, new_sk_id: NodeId, new_wal_source_connconf: PgConnectionConfig, + ctx: &RequestContext, ) { self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); + let ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionHandler, + ctx.download_behavior(), + ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( @@ -403,6 +423,7 @@ impl WalreceiverState { events_sender, cancellation, connect_timeout, + ctx, ) .await .context("walreceiver connection handling failure") @@ -1233,18 +1254,18 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .expect("Failed to create an empty timeline for dummy wal connection manager"); + let timeline = timeline.initialize(&ctx).unwrap(); + WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - timeline: harness - .load() - .await - .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager") - .initialize() - .unwrap(), + timeline, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs similarity index 94% rename from pageserver/src/walreceiver/walreceiver_connection.rs rename to pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1b9e4923fb..7e06c398af 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,7 +22,9 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ task_mgr, task_mgr::TaskKind, @@ -62,6 +64,7 @@ pub async fn handle_walreceiver_connection( events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, + ctx: RequestContext, ) -> anyhow::Result<()> { // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -77,9 +80,13 @@ pub async fn handle_walreceiver_connection( info!("DB connection stream finished: {expected_error}"); return Ok(()); } - Err(elapsed) => anyhow::bail!( - "Timed out while waiting {elapsed} for walreceiver connection to open" - ), + Err(_) => { + // Timing out to connect to a safekeeper node could happen long time, due to + // many reasons that pageserver cannot control. + // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
+ info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + return Ok(()); + } } }; @@ -99,10 +106,14 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. + let _connection_ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionPoller, + ctx.download_behavior(), + ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnection, + TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_id), Some(timeline.timeline_id), "walreceiver connection", @@ -117,7 +128,7 @@ pub async fn handle_walreceiver_connection( } } }, - + // Future: replace connection_cancellation with connection_ctx cancellation _ = connection_cancellation.cancelled() => info!("Connection cancelled"), } Ok(()) @@ -180,7 +191,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; while let Some(replication_message) = { select! { @@ -251,7 +262,7 @@ pub async fn handle_walreceiver_connection( ensure!(lsn.is_aligned()); walingest - .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; @@ -329,7 +340,7 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let (timeline_logical_size, _) = timeline - .get_current_logical_size() + .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { current_timeline_size: timeline_logical_size, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..3761c65668 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use crate::context::RequestContext; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; @@ -52,10 +53,14 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { + pub async fn new( + timeline: &'a Timeline, + startpoint: Lsn, + ctx: &'_ RequestContext, + ) -> anyhow::Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint).await?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,6 +85,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { modification.lsn = lsn; decode_wal_record(recdata, decoded, self.timeline.pg_version)?; @@ -97,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded) + self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } // Handle other special record types @@ -106,13 +112,14 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create).await?; + self.ingest_xlog_smgr_create(modification, &create, ctx) + .await?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate) + self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( @@ -126,7 +133,7 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP @@ -134,7 +141,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } else if self.timeline.pg_version == 15 { @@ -150,7 +159,7 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP @@ -158,7 +167,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } @@ -176,12 +187,13 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec) + self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { @@ -193,6 +205,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + ctx, ) .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED @@ -204,6 +217,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + ctx, ) .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code @@ -213,10 +227,12 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid).await?; + modification + .drop_twophase_file(parsed_xact.xid, ctx) + .await?; } else if info == pg_constants::XLOG_XACT_PREPARE { modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])) + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { @@ -232,6 +248,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { @@ -244,6 +261,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { @@ -251,12 +269,12 @@ impl<'a> WalIngest<'a> { self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec) + self.ingest_multixact_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded) + self.ingest_relmap_page(modification, &xlrec, decoded, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -292,7 +310,7 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk) + self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) .await?; } @@ -317,6 +335,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, @@ -359,14 +378,14 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec) + self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) .await?; } Ok(()) @@ -377,6 +396,7 @@ impl<'a> WalIngest<'a> { buf: &mut Bytes, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -456,7 +476,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -481,6 +501,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } else { @@ -496,6 +517,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -509,6 +531,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -524,6 +547,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, + ctx: &RequestContext, ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; @@ -539,7 +563,7 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn) + .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -547,10 +571,10 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; modification - .put_relmap_file(tablespace_id, db_id, filemap) + .put_relmap_file(tablespace_id, db_id, filemap, ctx) .await?; let mut num_rels_copied = 0; @@ -561,7 +585,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true) + .get_rel_size(src_rel, req_lsn, true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -570,7 +594,7 @@ impl<'a> WalIngest<'a> { forknum: src_rel.forknum, }; - modification.put_rel_creation(dst_rel, nblocks).await?; + modification.put_rel_creation(dst_rel, nblocks, ctx).await?; // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); @@ -579,7 +603,7 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + 
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -599,6 +623,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrCreate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, @@ -606,7 +631,7 @@ impl<'a> WalIngest<'a> { relnode: rec.rnode.relnode, forknum: rec.forknum, }; - self.put_rel_creation(modification, rel).await?; + self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } @@ -617,6 +642,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; @@ -629,7 +655,7 @@ impl<'a> WalIngest<'a> { relnode, forknum: MAIN_FORKNUM, }; - self.put_rel_truncation(modification, rel, rec.blkno) + self.put_rel_truncation(modification, rel, rec.blkno, ctx) .await?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { @@ -648,10 +674,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position - self.put_rel_truncation(modification, rel, fsm_physical_page_no) + self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } } @@ -670,10 +696,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position - self.put_rel_truncation(modification, rel, vm_page_no) + self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } } @@ -687,6 +713,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -745,10 +772,10 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true) + .get_rel_exists(rel, last_lsn, true, ctx) .await? { - self.put_rel_drop(modification, rel).await?; + self.put_rel_drop(modification, rel, ctx).await?; } } } @@ -759,6 +786,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", @@ -799,16 +827,15 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - - let slru_segments = modification + for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn) - .await?; - for segno in slru_segments { + .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .await? 
+ { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification - .drop_slru_segment(SlruKind::Clog, segno) + .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; trace!("Drop CLOG segment {:>04X}", segno); } @@ -900,6 +927,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, + ctx: &RequestContext, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; @@ -915,7 +943,7 @@ impl<'a> WalIngest<'a> { // contain, possibly partially, valid data. while segment != endsegment { modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32) + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) .await?; /* move to next segment, handling wraparound correctly */ @@ -937,6 +965,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, + ctx: &RequestContext, ) -> Result<()> { let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -944,18 +973,22 @@ impl<'a> WalIngest<'a> { buf.advance(12); modification - .put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..])) - .await?; - - Ok(()) + .put_relmap_file( + xlrec.tsid, + xlrec.dbid, + Bytes::copy_from_slice(&buf[..]), + ctx, + ) + .await } async fn put_rel_creation( &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -965,8 +998,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_page_image(rel, blknum, img)?; Ok(()) } @@ -977,8 +1012,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -988,8 +1025,9 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { - modification.put_rel_truncation(rel, nblocks).await?; + modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -997,17 +1035,22 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_drop(rel).await?; + modification.put_rel_drop(rel, ctx).await?; Ok(()) } - async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { - let exists = self.timeline.get_rel_exists(rel, lsn, true).await?; - let nblocks = if !exists { + async fn get_relsize( + &mut self, + rel: RelTag, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { 0 } else { - self.timeline.get_rel_size(rel, lsn, true).await? + self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
}; Ok(nblocks) } @@ -1017,23 +1060,28 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true).await? { + let old_nblocks = if !self + .timeline + .get_rel_exists(rel, last_lsn, true, ctx) + .await? + { // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true).await? + self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks).await?; + modification.put_rel_extend(rel, new_nblocks, ctx).await?; // fill the gap with zeros for gap_blknum in old_nblocks..blknum { @@ -1050,8 +1098,9 @@ impl<'a> WalIngest<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum) + ctx: &RequestContext, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) @@ -1063,6 +1112,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens @@ -1075,17 +1125,17 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn) + .get_slru_segment_exists(kind, segno, last_lsn, ctx) .await? { // create it with 0 size initially, the logic below will extend it modification - .put_slru_segment_creation(kind, segno, 0) + .put_slru_segment_creation(kind, segno, 0, ctx) .await?; 0 } else { self.timeline - .get_slru_segment_size(kind, segno, last_lsn) + .get_slru_segment_size(kind, segno, last_lsn, ctx) .await? 
}; @@ -1134,41 +1184,44 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test<'a>( + tline: &'a Timeline, + ctx: &RequestContext, + ) -> Result> { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; - m.put_relmap_file(0, 111, Bytes::from("")).await?; // dummy relmapper file + m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10)).await?; + let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) } #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_creation(&mut m, TESTREL_A).await?; + walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; m.commit()?; @@ -1176,120 +1229,157 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); - assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate last block let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 2).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) + .await?; m.commit()?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate to zero length let mut m = tline.begin_modification(Lsn(0x68)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 0).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false).await?, 0); + assert_eq!( + 
tline + .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .await?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1297,21 +1387,26 @@ mod tests { // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, + 1501 + ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1323,31 +1418,40 @@ mod tests { // and then created it again within the same layer. 
#[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_drop(&mut m, TESTREL_A).await?; + walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; m.commit()?; // Check that rel is not visible anymore assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x30), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .await?, false ); @@ -1357,16 +1461,23 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x40), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, + 1 + ); Ok(()) } @@ -1376,9 +1487,9 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1386,27 +1497,33 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, relsize ); @@ -1416,7 +1533,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) .await?, TEST_IMG(&data) ); @@ -1425,18 +1542,25 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 1).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) + .await?; m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1444,7 +1568,9 @@ mod tests { // should still see all blocks with older LSN assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, relsize ); for blkno in 0..relsize { @@ -1452,7 +1578,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1465,17 +1591,21 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, relsize ); // Check relation content @@ -1484,7 +1614,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1497,9 +1627,9 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
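As background for the segment arithmetic the next test exercises: with PostgreSQL's default build options (8 KB pages and 1 GB segment files, assumed here rather than taken from this patch), RELSEG_SIZE comes out to 131072 blocks, and a block number maps onto a segment file as in this small sketch:

    const BLCKSZ: u32 = 8192;                             // default PostgreSQL page size
    const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / BLCKSZ; // 131072 blocks per 1 GB segment

    /// Map a relation block number to (segment file number, block within that segment).
    fn segment_of(blkno: u32) -> (u32, u32) {
        (blkno / RELSEG_SIZE, blkno % RELSEG_SIZE)
    }

    fn main() {
        assert_eq!(segment_of(RELSEG_SIZE - 1), (0, RELSEG_SIZE - 1)); // last block of segment 0
        assert_eq!(segment_of(RELSEG_SIZE), (1, 0));                   // first block of segment 1
    }

test_large_rel below writes RELSEG_SIZE + 1 blocks precisely so the relation crosses from segment 0 into segment 1 in Postgres terms, and is then truncated back across that boundary.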
#[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1507,7 +1637,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest - .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; m.commit()?; } @@ -1515,7 +1645,7 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE + 1 ); @@ -1523,11 +1653,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1536,11 +1666,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1552,11 +1682,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber) + .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fd0524016f..c943bf0a27 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,16 +22,18 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; +use std::collections::VecDeque; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; -use std::sync::Mutex; +use std::sync::{Mutex, MutexGuard}; use std::time::Duration; use std::time::Instant; use std::{fs, io}; @@ -90,6 +92,20 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } +struct ProcessInput { + child: NoLeakChild, + stdin: ChildStdin, + stderr_fd: RawFd, + stdout_fd: RawFd, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + /// /// This is the 
real implementation that uses a Postgres process to /// perform WAL replay. Only one thread can use the process at a time, @@ -101,7 +117,9 @@ pub struct PostgresRedoManager { tenant_id: TenantId, conf: &'static PageServerConf, - process: Mutex>, + stdout: Mutex>, + stdin: Mutex>, + stderr: Mutex>, } /// Can this request be served by neon redo functions @@ -209,16 +227,17 @@ impl PostgresRedoManager { PostgresRedoManager { tenant_id, conf, - process: Mutex::new(None), + stdin: Mutex::new(None), + stdout: Mutex::new(None), + stderr: Mutex::new(None), } } /// Launch process pre-emptively. Should not be needed except for benchmarking. - pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> { - let inner = self.process.get_mut().unwrap(); - if inner.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *inner = Some(p); + pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> { + let mut proc = self.stdin.lock().unwrap(); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } Ok(()) } @@ -241,22 +260,19 @@ impl PostgresRedoManager { let start_time = Instant::now(); - let mut process_guard = self.process.lock().unwrap(); + let mut proc = self.stdin.lock().unwrap(); let lock_time = Instant::now(); // launch the WAL redo process on first use - if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *process_guard = Some(p); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } - let process = process_guard.as_mut().unwrap(); - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; - let result = process - .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + let result = self + .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) .map_err(WalRedoError::IoError); let end_time = Instant::now(); @@ -295,8 +311,22 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - let process = process_guard.take().unwrap(); - process.kill(); + // self.stdin only holds stdin & stderr as_raw_fd(). + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respsectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get other file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd` . + if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } } result } @@ -595,32 +625,23 @@ impl CloseFileDescriptors for C { } } -/// -/// Handle to the Postgres WAL redo process -/// -struct PostgresRedoProcess { - tenant_id: TenantId, - child: NoLeakChild, - stdin: ChildStdin, - stdout: ChildStdout, - stderr: ChildStderr, -} - -impl PostgresRedoProcess { +impl PostgresRedoManager { // // Start postgres binary in special WAL redo mode. 
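The earlier comment about keeping self.stdout and self.stderr open while killing the child boils down to a descriptor staleness check. A minimal sketch of that check, with illustrative names (only the raw-fd comparison mirrors the patch):

    use std::io::{Error, ErrorKind};
    use std::os::unix::io::{AsRawFd, RawFd};
    use std::process::ChildStdout;

    /// `captured_fd` is the stdout descriptor recorded under the stdin lock, before the
    /// request was written. Because the old ChildStdout handle is still open, a relaunched
    /// child cannot be assigned the same descriptor number, so a mismatch proves that the
    /// process we sent our request to is gone.
    fn same_process(current_stdout: &ChildStdout, captured_fd: RawFd) -> Result<(), Error> {
        if current_stdout.as_raw_fd() != captured_fd {
            return Err(Error::new(
                ErrorKind::BrokenPipe,
                "WAL redo process was restarted; this response will never arrive",
            ));
        }
        Ok(())
    }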
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))] fn launch( - conf: &PageServerConf, - tenant_id: TenantId, + &self, + input: &mut MutexGuard>, pg_version: u32, - ) -> Result { + ) -> Result<(), Error> { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(&tenant_id).join("wal-redo-datadir"), + self.conf + .tenant_path(&self.tenant_id) + .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -634,10 +655,12 @@ impl PostgresRedoProcess { ) })?; } - let pg_bin_dir_path = conf + let pg_bin_dir_path = self + .conf .pg_bin_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; - let pg_lib_dir_path = conf + let pg_lib_dir_path = self + .conf .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; @@ -723,27 +746,31 @@ impl PostgresRedoProcess { // all fallible operations post-spawn are complete, so get rid of the guard let child = scopeguard::ScopeGuard::into_inner(child); - Ok(PostgresRedoProcess { - tenant_id, + **input = Some(ProcessInput { child, + stdout_fd: stdout.as_raw_fd(), + stderr_fd: stderr.as_raw_fd(), stdin, + n_requests: 0, + }); + + *self.stdout.lock().unwrap() = Some(ProcessOutput { stdout, - stderr, - }) + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }); + *self.stderr.lock().unwrap() = Some(stderr); + + Ok(()) } - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] - fn kill(self) { - self.child.kill_and_wait(); - } - - // // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))] fn apply_wal_records( - &mut self, + &self, + mut input: MutexGuard>, tag: BufferTag, base_img: Option, records: &[(Lsn, NeonWalRecord)], @@ -780,33 +807,23 @@ impl PostgresRedoProcess { build_get_page_msg(tag, &mut writebuf); WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - // The input is now in 'writebuf'. Do a blind write first, writing as much as - // we can, before calling poll(). That skips one call to poll() if the stdin is - // already available for writing, which it almost certainly is because the - // process is idle. - let mut nwrite = self.stdin.write(&writebuf)?; - - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + let proc = input.as_mut().unwrap(); + let mut nwrite = 0usize; + let stdout_fd = proc.stdout_fd; // Prepare for calling poll() let mut pollfds = [ - PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stderr_fd, PollFlags::POLLIN), + PollFd::new(stdout_fd, PollFlags::POLLIN), ]; - // We do three things simultaneously: send the old base image and WAL records to - // the child process's stdin, read the result from child's stdout, and forward any logging + // We do two things simultaneously: send the old base image and WAL records to + // the child process's stdin and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < BLCKSZ.into() { - // If we have more data to write, wake up if 'stdin' becomes writeable or - // we have data to read. Otherwise only wake up if there's data to read. - let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; + while nwrite < writebuf.len() { let n = loop { - match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) { Err(e) if e == nix::errno::Errno::EINTR => continue, res => break res, } @@ -820,14 +837,16 @@ impl PostgresRedoProcess { let err_revents = pollfds[1].revents().unwrap(); if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { let mut errbuf: [u8; 16384] = [0; 16384]; - let n = self.stderr.read(&mut errbuf)?; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; // The message might not be split correctly into lines here. But this is // good enough, the important thing is to get the message to the log. - if n > 0 { + if len > 0 { error!( "wal-redo-postgres: {}", - String::from_utf8_lossy(&errbuf[0..n]) + String::from_utf8_lossy(&errbuf[0..len]) ); // To make sure we capture all log from the process if it fails, keep @@ -841,33 +860,157 @@ impl PostgresRedoProcess { )); } - // If we have more data to write and 'stdin' is writeable, do write. - if nwrite < writebuf.len() { - let in_revents = pollfds[2].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += self.stdin.write(&writebuf[nwrite..])?; - } else if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdin unexpectedly", - )); - } - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += self.stdout.read(&mut resultbuf[nresult..])?; - } else if out_revents.contains(PollFlags::POLLHUP) { + // If 'stdin' is writeable, do write. + let in_revents = pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } else if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
return Err(Error::new( ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", + "WAL redo process closed its stdin unexpectedly", )); } } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(input); - Ok(Bytes::from(resultbuf)) + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output_guard = self.stdout.lock().unwrap(); + let output = output_guard.as_mut().unwrap(); + if output.stdout.as_raw_fd() != stdout_fd { + // If stdout file descriptor is changed then it means that walredo process is crashed and restarted. + // As far as ProcessInput and ProcessOutout are protected by different mutexes, + // it can happen that we send request to one process and waiting response from another. + // To prevent such situation we compare stdout file descriptors. + // As far as old stdout pipe is destroyed only after new one is created, + // it can not reuse the same file descriptor, so this check is safe. + // + // Cross-read this with the comment in apply_batch_postgres if result.is_err(). + // That's where we kill the child process. + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; + + if n == 0 { + return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + } + + // If we have some messages in stderr, forward them to the log. + let err_revents = pollfds[1].revents().unwrap(); + if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + let mut errbuf: [u8; 16384] = [0; 16384]; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; + + // The message might not be split correctly into lines here. But this is + // good enough, the important thing is to get the message to the log. 
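The comment above lays out the new pipelining scheme: the stdin side hands out monotonically increasing request numbers, the stdout side buffers responses in a ring of Option slots, and each caller takes exactly its own slot. A condensed sketch of just that bookkeeping, with the poll()-driven I/O and stderr forwarding replaced by placeholder send/recv helpers and Bytes simplified to Vec<u8> (only the counter and ring-buffer logic mirrors the patch):

    use std::collections::VecDeque;
    use std::sync::Mutex;

    struct Input {
        n_requests: usize,
    }

    struct Output {
        n_processed_responses: usize,
        pending_responses: VecDeque<Option<Vec<u8>>>,
    }

    struct Redo {
        input: Mutex<Input>,
        output: Mutex<Output>,
    }

    impl Redo {
        fn request(&self, payload: &[u8]) -> Vec<u8> {
            // Phase 1: under the input lock, write the request and take a ticket number.
            let request_no = {
                let mut inp = self.input.lock().unwrap();
                send(payload); // stand-in for the poll()-based write to the child's stdin
                let no = inp.n_requests;
                inp.n_requests += 1;
                no
            }; // input lock released; other senders may now interleave their requests
            // Phase 2: under the output lock, read whole responses until ours is buffered.
            let mut out = self.output.lock().unwrap();
            let n_processed = out.n_processed_responses;
            while n_processed + out.pending_responses.len() <= request_no {
                out.pending_responses.push_back(Some(recv())); // one page image per request
            }
            // Take our own slot, then trim leading `None`s left behind by earlier callers.
            let res = out.pending_responses[request_no - n_processed].take().unwrap();
            while let Some(front) = out.pending_responses.front() {
                if front.is_none() {
                    out.pending_responses.pop_front();
                    out.n_processed_responses += 1;
                } else {
                    break;
                }
            }
            res
        }
    }

    fn send(_payload: &[u8]) { /* write the serialized redo request */ }
    fn recv() -> Vec<u8> { vec![0u8; 8192] /* read one BLCKSZ page image back */ }

A caller that loses the race for the output lock simply finds its response already sitting in pending_responses, buffered by whichever thread read ahead of it, and only has to take its slot. The real code additionally compares output.stdout.as_raw_fd() against the descriptor captured in phase 1, so a request sent to a child that crashed and was relaunched fails with BrokenPipe instead of consuming the new child's responses.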
+ if len > 0 { + error!( + "wal-redo-postgres: {}", + String::from_utf8_lossy(&errbuf[0..len]) + ); + + // To make sure we capture all log from the process if it fails, keep + // reading from the stderr, before checking the stdout. + continue; + } + } else if err_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stderr unexpectedly", + )); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = pollfds[2].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } else if out_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) } } diff --git a/poetry.lock b/poetry.lock index edbcddd576..fc37124184 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,21 @@ +[[package]] +name = "aiohttp" +version = "3.7.0" +description = "Async http client/server framework (asyncio)" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +async-timeout = ">=3.0,<4.0" +attrs = ">=17.3.0" +chardet = ">=2.0,<4.0" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["aiodns", "brotlipy", "cchardet"] + [[package]] name = "aiopg" version = "1.3.4" @@ -41,11 +59,11 @@ six = ">=1.9.0" [[package]] name = "async-timeout" -version = "4.0.2" +version = "3.0.1" description = "Timeout context manager for asyncio programs" category = 
"main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.5.3" [[package]] name = "asyncpg" @@ -560,6 +578,14 @@ networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" +[[package]] +name = "chardet" +version = "3.0.4" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -939,6 +965,14 @@ server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)" ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "mypy" version = "0.991" @@ -1580,6 +1614,18 @@ category = "main" optional = false python-versions = ">=3.4" +[[package]] +name = "yarl" +version = "1.8.2" +description = "Yet another URL library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [[package]] name = "zipp" version = "3.8.1" @@ -1595,9 +1641,44 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" +content-hash = "0f7289ef9439d1d7cd36b07efb53741b773669b0f860189c800270b7def0c241" [metadata.files] +aiohttp = [ + {file = "aiohttp-3.7.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:72fe89f7e14939e896d984c4b592580f8cdfa7497feb1c0c24639a9c60be3eb9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fdf778d4c4bf976e69a37213fe8083613d0851976ddcf485bd7c0650a43d3852"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:fee7b5e68939ffc09f9b29f167ed49c8b50de3eee0a1d8108b439ddd9963af46"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:dd64634713be409202058f2ea267dfbcdd74b387b8793425f21ef0266d45d0e9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:713dd7fd70ddda9dc8d014c49dd0e55b58afe4e0cddb8722c7501f53edf30c3f"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d31c43f7c4948ce01957f9a1ceee0784e067778477557ebccdf805398331c1a1"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:5e26d6003eb6df304608d9fd9c9437065a8532d869a3ffcbd8113a3d710f8239"}, + {file = "aiohttp-3.7.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf08462cddd10ddd8ffe5cb5c1638bfa051290909ebedb31c06e46578b9b7529"}, + {file = "aiohttp-3.7.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:07bacf6721db51a4c6160ed3031a2a97910647969dafd7c653f600f3b542f463"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:245b58e30bc889d18b783db2f09ef1d814f466e15c84325410827451297003a0"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b392e5c3e122586c49cd8b9426f577bf4d51958933b839d158d28b69515af74e"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:5b5c320621a171aa85f96909af28fbb5286bd6842066db3062b083ba92261256"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:97d2341d1360dbe2c5b1d94922f7d68f9ce2ded1daab88b9bdeb49ce419cdc1b"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:beda23f292716887532661dc19abb9db2302ccfbd671a080cd8f4be7463d0841"}, + 
{file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:cbcaae9a6f14f762348d19b2dce8162772c0b0a1739314e18492a308a22caf96"}, + {file = "aiohttp-3.7.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7a49ef7b691babc83db126db874fbf26ba2f781899b91399f9ff8b235f059245"}, + {file = "aiohttp-3.7.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f56892f57310415cf6a179eec3ea6c7a82a9d37fbc00894943ea3154011a6d2a"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:df1274b7620c32d3b15bfb0a8fb3165dd6cdc9c39f4db74d162f051c80826542"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a04ba359dc5f2e21b96bfc90c4a7665441441ba61b52e992b7799493889a3419"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:f548d7976d168f0f45ac5909ca5f606ae3f6f7aa1725b22504004a053b29a7d0"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:deef02e2a9f5095463098c7c22d5566f20a6e4e14fc0996c0c2efc74d461b680"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:fe44c96bc380588d36729392b602470d88a7c18e646e95dd4348cafe3900d91d"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:9210532e6e95b40d22a33415bb84423eef3f633b2d2339b97f3b26438eebc466"}, + {file = "aiohttp-3.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:a586e476a251483d222c73dfb2f27df90bc4ea1b8c7da9396236510e0d4046c8"}, + {file = "aiohttp-3.7.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:900012c5f12ff72b1453229afe288ddc9135176df8b3b3cc5b8f6cfde912aaa4"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:064d5f0738bcbab3e0c0ecf85c93b5ee1e07e124f994eaa03bf73687f3ecd9da"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:0a2edf27865e66a33f64fa793cd14d0aae8127ce20a858539e97c25b600556dc"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:eaa8ae734639d5a0a3b5e33a154b8bfef384cdc090706f95c387cae8b21af764"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:a8a42f05491d9c04a77806875a68f84fea9af7a59d47b7897cb166632f74606c"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:b19ded3f6957693b97ba8372aacb5b0021639bbd5e77b1e960796bcef5431969"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cefbd7ce7d1f1db43749a077e4970e29e2b631f367c9eff3862c3c886b4218dd"}, + {file = "aiohttp-3.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d64f7dfd4e326d9b0d11b07fcd5ebf78844ba3c8f7699f38b50b0e0db0ae68f"}, + {file = "aiohttp-3.7.0.tar.gz", hash = "sha256:176f1d2b2bc07044f4ed583216578a72a2bd35dffdeb92e0517d0aaa29d29549"}, +] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, @@ -1611,8 +1692,8 @@ allure-python-commons = [ {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"}, + {file = 
"async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"}, ] asyncpg = [ {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"}, @@ -1787,6 +1868,10 @@ cfn-lint = [ {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, ] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] charset-normalizer = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -1960,6 +2045,82 @@ moto = [ {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] +multidict = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = 
"sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + 
{file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] mypy = [ {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, @@ -2412,6 +2573,82 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] +yarl = [ + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, + {file = 
"yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, + {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, + {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, + {file = 
"yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, + {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, + {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, + {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, + {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, + {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, + {file = 
"yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, + {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, + {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, + {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, + {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, + {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, +] zipp = [ {file = 
"zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 5d44774df9..1b61ab108f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -30,7 +30,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::{info, info_span, Instrument}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -49,7 +49,7 @@ async fn main() -> anyhow::Result<()> { .init(); // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let arg_matches = cli().get_matches(); diff --git a/pyproject.toml b/pyproject.toml index b4fb7a9e7d..a817e9dda5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" +aiohttp = "3.7" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b130ea86bd..1a068412c8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use utils::{ id::NodeId, logging::{self, LogFormat}, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals, tcp_listener, }; @@ -173,7 +173,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.my_id.to_string())], + ); start_safekeeper(conf) } diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py new file mode 100644 index 0000000000..5472d86d8f --- /dev/null +++ b/scripts/force_layer_download.py @@ -0,0 +1,324 @@ +import argparse +import asyncio +import json +import logging +import signal +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Awaitable, Dict, List, Tuple + +import aiohttp + + +class ClientException(Exception): + pass + + +class Client: + def __init__(self, pageserver_api_endpoint: str, max_concurrent_layer_downloads: int): + self.endpoint = pageserver_api_endpoint + self.max_concurrent_layer_downloads = max_concurrent_layer_downloads + self.sess = aiohttp.ClientSession() + + async def close(self): + await self.sess.close() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_t, exc_v, exc_tb): + await self.close() + + async def parse_response(self, resp, expected_type): + body = await resp.json() + if not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, expected_type): + raise ClientException(f"expecting {expected_type.__name__}") + return body + + async def get_tenant_ids(self): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant") + payload = await self.parse_response(resp=resp, expected_type=list) + return [t["id"] for t in payload] + + async def get_timeline_ids(self, tenant_id): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant/{tenant_id}/timeline") + payload = await self.parse_response(resp=resp, 
expected_type=list) + return [t["timeline_id"] for t in payload] + + async def timeline_spawn_download_remote_layers(self, tenant_id, timeline_id, ongoing_ok=False): + resp = await self.sess.post( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json={"max_concurrent_downloads": self.max_concurrent_layer_downloads}, + ) + body = await resp.json() + if resp.status == 409: + if not ongoing_ok: + raise ClientException("download already ongoing") + # response body has same shape for ongoing and newly created + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, dict): + raise ClientException("expecting dict") + + return body + + async def timeline_poll_download_remote_layers_status( + self, + tenant_id, + timeline_id, + ): + resp = await self.sess.get( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + body = await resp.json() + + if resp.status == 404: + return None + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + return body + + +@dataclass +class Completed: + """The status dict returned by the API""" + + status: Dict[str, Any] + + +sigint_received = asyncio.Event() + + +async def do_timeline(client: Client, tenant_id, timeline_id): + """ + Spawn download_remote_layers task for given timeline, + then poll until the download has reached a terminal state. + + If the terminal state is not 'Completed', the method raises an exception. + The caller is responsible for inspecting `failed_download_count`. + + If there is already a task going on when this method is invoked, + it raises an exception. + """ + + # Don't start new downloads if user pressed SIGINT. + # This task will show up as "raised_exception" in the report. + if sigint_received.is_set(): + raise Exception("not starting because SIGINT received") + + # run downloads to completion + + status = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + if status is not None and status["state"] == "Running": + raise Exception("download is already running") + + spawned = await client.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, ongoing_ok=False + ) + + while True: + st = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + logging.info(f"{tenant_id}:{timeline_id} state is: {st}") + + if spawned["task_id"] != st["task_id"]: + raise ClientException("download task ids changed while polling") + + if st["state"] == "Running": + await asyncio.sleep(10) + continue + + if st["state"] != "Completed": + raise ClientException( + f"download task reached terminal state != Completed: {st['state']}" + ) + + return Completed(st) + + +def handle_sigint(): + logging.info("SIGINT received, asyncio event set. 
Will not start new downloads.") + global sigint_received + sigint_received.set() + + +async def main(args): + async with Client(args.pageserver_http_endpoint, args.max_concurrent_layer_downloads) as client: + exit_code = await main_impl(args, args.report_output, client) + + return exit_code + + +async def taskq_handler(task_q, result_q): + while True: + try: + (id, fut) = task_q.get_nowait() + except asyncio.QueueEmpty: + logging.debug("taskq_handler observed empty task_q, returning") + return + logging.info(f"starting task {id}") + try: + res = await fut + except Exception as e: + res = e + result_q.put_nowait((id, res)) + + +async def print_progress(result_q, tasks): + while True: + await asyncio.sleep(10) + logging.info(f"{result_q.qsize()} / {len(tasks)} tasks done") + + +async def main_impl(args, report_out, client: Client): + """ + Returns OS exit status. + """ + tenant_and_timline_ids: List[Tuple[str, str]] = [] + # fill tenant_and_timline_ids based on spec + for spec in args.what: + comps = spec.split(":") + if comps == ["ALL"]: + logging.info("get tenant list") + tenant_ids = await client.get_tenant_ids() + get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] + gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) + assert len(tenant_ids) == len(gathered) + tenant_and_timline_ids = [] + for tid, tlids in zip(tenant_ids, gathered): + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 1: + tid = comps[0] + tlids = await client.get_timeline_ids(tid) + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 2: + tenant_and_timline_ids.append((comps[0], comps[1])) + else: + raise ValueError(f"invalid what-spec: {spec}") + + logging.info("expanded spec:") + for tid, tlid in tenant_and_timline_ids: + logging.info(f"{tid}:{tlid}") + + logging.info("remove duplicates after expanding spec") + tmp = list(set(tenant_and_timline_ids)) + assert len(tmp) <= len(tenant_and_timline_ids) + if len(tmp) != len(tenant_and_timline_ids): + logging.info(f"spec had {len(tenant_and_timline_ids) - len(tmp)} duplicates") + tenant_and_timline_ids = tmp + + logging.info("create tasks and process them at specified concurrency") + task_q: asyncio.Queue[Tuple[str, Awaitable[Any]]] = asyncio.Queue() + tasks = { + f"{tid}:{tlid}": do_timeline(client, tid, tlid) for tid, tlid in tenant_and_timline_ids + } + for task in tasks.items(): + task_q.put_nowait(task) + + result_q: asyncio.Queue[Tuple[str, Any]] = asyncio.Queue() + taskq_handlers = [] + for _ in range(0, args.concurrent_tasks): + taskq_handlers.append(taskq_handler(task_q, result_q)) + + print_progress_task = asyncio.create_task(print_progress(result_q, tasks)) + + await asyncio.gather(*taskq_handlers) + print_progress_task.cancel() + + logging.info("all tasks handled, generating report") + + results = [] + while True: + try: + results.append(result_q.get_nowait()) + except asyncio.QueueEmpty: + break + assert task_q.empty() + + report = defaultdict(list) + for id, result in results: + logging.info(f"result for {id}: {result}") + if isinstance(result, Completed): + if result.status["failed_download_count"] == 0: + report["completed_without_errors"].append(id) + else: + report["completed_with_download_errors"].append(id) + elif isinstance(result, Exception): + report["raised_exception"].append(id) + else: + raise ValueError("unexpected result type") + json.dump(report, report_out) + + 
logging.info("--------------------------------------------------------------------------------") + + report_success = len(report["completed_without_errors"]) == len(tenant_and_timline_ids) + if not report_success: + logging.error("One or more tasks encountered errors.") + else: + logging.info("All tasks reported success.") + logging.info("Inspect log for details and report file for JSON summary.") + + return report_success + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--report-output", + type=argparse.FileType("w"), + default="-", + help="where to write report output (default: stdout)", + ) + parser.add_argument( + "--pageserver-http-endpoint", + default="http://localhost:9898", + help="pageserver http endpoint, (default http://localhost:9898)", + ) + parser.add_argument( + "--concurrent-tasks", + required=False, + default=5, + type=int, + help="Max concurrent download tasks created & polled by this script", + ) + parser.add_argument( + "--max-concurrent-layer-downloads", + dest="max_concurrent_layer_downloads", + required=False, + default=8, + type=int, + help="Max concurrent download tasks spawned by pageserver. Each layer is a separate task.", + ) + + parser.add_argument( + "what", + nargs="+", + help="what to download: ALL|tenant_id|tenant_id:timeline_id", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + level = logging.INFO + if args.verbose: + level = logging.DEBUG + logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=level, + ) + + loop = asyncio.get_event_loop() + + loop.add_signal_handler(signal.SIGINT, handle_sigint) + sys.exit(asyncio.run(main(args))) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 6d80e96bf1..e33369bbb1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -45,7 +45,7 @@ use storage_broker::{ use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -425,7 +425,7 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let args = Args::parse(); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b78e06c22..bdaaa95216 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -46,6 +46,12 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_physical_size", ) +PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( + "pageserver_storage_operations_seconds_global_count", + "pageserver_storage_operations_seconds_global_sum", + "pageserver_storage_operations_seconds_global_bucket", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", "pageserver_resident_physical_size", @@ -61,13 +67,13 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", - "pageserver_storage_operations_seconds_bucket", - "pageserver_storage_operations_seconds_count", - "pageserver_storage_operations_seconds_sum", + "pageserver_storage_operations_seconds_count_total", + "pageserver_storage_operations_seconds_sum_total", "pageserver_wait_lsn_seconds_bucket", "pageserver_wait_lsn_seconds_count", "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + "pageserver_tenant_states_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 29cdcb18ce..cbbf01a285 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,7 +2,15 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + wait_for_upload, +) +from fixtures.types import Lsn +from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -57,7 +65,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, - "gc_period": 100, + "gc_period": 60 * 60, "image_creation_threshold": 3, "pitr_interval": 604800, # 7 days }.items() @@ -158,3 +166,46 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 60, }.items() ) + + +def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_creating_tenant_conf_after_attach", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + # tenant is created with defaults, as in without config file + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + config_path = env.repo_dir / "tenants" / str(tenant_id) / "config" + assert config_path.exists(), "config file is always initially created" + + http_client = env.pageserver.http_client() + + detail = http_client.timeline_detail(tenant_id, timeline_id) + last_record_lsn = Lsn(detail["last_record_lsn"]) + assert last_record_lsn.lsn_int != 0, "initdb must have executed" + + wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn) + + http_client.tenant_detach(tenant_id) + + assert not config_path.exists(), "detach did not remove config file" + + http_client.tenant_attach(tenant_id) + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(http_client, tenant_id, "Active"), + ) + + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) + contents_first = config_path.read_text() + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"}) + contents_later = config_path.read_text() + + # dont test applying the setting here, we have that another test case to show it + # we just care about being able to create the file + assert len(contents_first) > len(contents_later) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index db5bb679f2..6c3454b79b 100644 --- a/test_runner/regress/test_tenant_detach.py +++ 
b/test_runner/regress/test_tenant_detach.py @@ -6,6 +6,7 @@ from threading import Thread import asyncpg import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -59,11 +60,11 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # Wait for the all data to be processed by the pageserver and uploaded in remote storage wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -78,15 +79,34 @@ def test_tenant_reattach( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + tenant_metric_filter = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + } + pageserver_last_record_lsn_before_detach = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) - with pg.cursor() as cur: - assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + time.sleep(1) # for metrics propagation - # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") - assert env.pageserver.log_contains(".*download.*failed, will retry.*") + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + pageserver_last_record_lsn = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + + assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") num_connections = 10 @@ -237,7 +257,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") # first check for non existing tenant tenant_id = TenantId.generate() @@ -272,8 +292,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too + # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") + # Timelines get stopped during detach, ignore the gc calls that error, whitnessing that + 
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9477ae3c25..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,5 +1,6 @@ import os import shutil +import time from contextlib import closing from datetime import datetime from pathlib import Path @@ -8,6 +9,7 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import ( + PAGESERVER_GLOBAL_METRICS, PAGESERVER_PER_TENANT_METRICS, PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, parse_metrics, @@ -160,6 +162,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + # Test (a subset of) pageserver global metrics + for metric in PAGESERVER_GLOBAL_METRICS: + ps_samples = ps_metrics.query_all(metric, {}) + assert len(ps_samples) > 0 + for sample in ps_samples: + labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) + log.info(f"{sample.name}{{{labels}}} {sample.value}") + @pytest.mark.parametrize( "remote_storage_kind", @@ -259,7 +269,7 @@ def test_pageserver_with_empty_tenants( files_in_timelines_dir == 0 ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" - # Trigger timeline reinitialization after pageserver restart + # Trigger timeline re-initialization after pageserver restart env.postgres.stop_all() env.pageserver.stop() @@ -278,7 +288,51 @@ def test_pageserver_with_empty_tenants( broken_tenant["state"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) + assert ( + broken_tenant_status["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] assert ( loaded_tenant["state"] == "Active" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + assert ( + loaded_tenant_status["state"] == "Active" + ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + + time.sleep(1) # to allow metrics propagation + + ps_metrics = parse_metrics(client.get_metrics(), "pageserver") + broken_tenants_metric_filter = { + "tenant_id": str(tenant_without_timelines_dir), + "state": "broken", + } + active_tenants_metric_filter = { + "tenant_id": str(tenant_with_empty_timelines_dir), + "state": "active", + } + + tenant_active_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=active_tenants_metric_filter + ).value + ) + + assert ( + tenant_active_count == 1 + ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + + tenant_broken_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=broken_tenants_metric_filter + ).value + ) + + assert ( + tenant_broken_count == 1 + ), f"Tenant {tenant_without_timelines_dir} should have 
metric as broken" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f4b71ae9b7..3a852b2207 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } +futures-executor = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -31,17 +33,21 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128", "libm"] } +num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +ring = { version = "0.16", features = ["std"] } +rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } From a41b5244a84116c16ace9143a02f8d21d218a84c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 20 Feb 2023 18:22:49 +0300 Subject: [PATCH 036/412] Add new safekeeper to ap-southeast-1 prod (#3645) (#3646) To trigger deployment of #3645 to production. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 7c6d1db6d7..71fced23c2 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -36,3 +36,5 @@ storage: ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 + safekeeper-3.ap-southeast-1.aws.neon.tech: + ansible_host: i-05481f3bc88cfc2d4 From 78aca668d08190934708f864fbddd14da316e8a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 19:31:53 +0200 Subject: [PATCH 037/412] fix: log download failed error (#3661) Fixes #3659 --- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_tenants.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e9ce52d1ab..20d1d2bfb6 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -74,7 +74,7 @@ async fn compaction_loop(tenant_id: TenantId) { let period = tenant.get_compaction_period(); // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop + // loop, #3501. There are also additional "allowed_errors" in tests. if first { first = false; if random_init_delay(period, &cancel).await.is_err() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f2b0a98509..8bc02cd10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3745,6 +3745,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. + info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e56bb1b469..9e75396799 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,6 +250,10 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) + # this is until #3501 + env.pageserver.allowed_errors.append( + ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" + ) client = env.pageserver.http_client() From 15273a9b669c75b8a5bcda186715e7c1a940bfd6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 20:20:13 +0200 Subject: [PATCH 038/412] chore: ignore all compaction inactive tenant errors (#3665) these are happening in tests because of #3655 but they sure took some time to appear. makes the `Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant` into a globally allowed error, because it has been seen failing on different test cases. 
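As a rough illustration of what a "globally allowed error" means here (the helper below is a hypothetical sketch, not the repository's actual fixture code), such a list is typically consumed by scanning the pageserver log and flagging only the error lines that no allowed pattern matches:

```
# Illustrative sketch only; function and variable names are hypothetical,
# not the real test fixtures.
import re
from typing import Iterable, List

DEFAULT_ALLOWED_ERRORS: List[str] = [
    # this is until #3501
    ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
]

def unexpected_error_lines(log_lines: Iterable[str], allowed: List[str]) -> List[str]:
    """Return log lines containing ERROR that match none of the allowed regexes."""
    compiled = [re.compile(pattern) for pattern in allowed]
    return [
        line
        for line in log_lines
        if "ERROR" in line and not any(r.match(line) for r in compiled)
    ]
```

Promoting the pattern into such a shared default list, as the diff below does, means individual tests no longer have to append it case by case.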
--- test_runner/fixtures/neon_fixtures.py | 2 ++ test_runner/regress/test_tenants.py | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63196609cc..73f224039e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2080,6 +2080,8 @@ class NeonPageserver(PgProtocol): ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", + # this is until #3501 + ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", ] def start( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9e75396799..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,10 +250,6 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) - # this is until #3501 - env.pageserver.allowed_errors.append( - ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" - ) client = env.pageserver.http_client() From 43bf6d0a0f2695074f360bbc3b1deea2065f6321 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:09:31 +0200 Subject: [PATCH 039/412] calculate_logical_size: no longer use spawn_blocking (#3664) Calculation of logical size is now async because of layer downloads, so we shouldn't use spawn_blocking for it. Use of `spawn_blocking` exhausted resources which are needed by `tokio::io::copy` when copying from a stream to a file which lead to deadlock. Fixes: #3657 --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8bc02cd10a..176eb61ff3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1770,15 +1770,9 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); - tokio::task::spawn_blocking(move || { - // Run in a separate thread since this can do a lot of - // synchronous file IO without .await inbetween - // if there are no RemoteLayers that would require downloading. - let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) - }) - .await - .context("Failed to spawn calculation result task")? + self_calculation + .calculate_logical_size(init_lsn, cancel, &ctx) + .await }; let timeline_state_cancellation = async { loop { @@ -1811,7 +1805,7 @@ impl Timeline { tokio::pin!(calculation); loop { tokio::select! { - res = &mut calculation => { return res } + res = &mut calculation => { return res } reason = timeline_state_cancellation => { debug!(reason = reason, "cancelling calculation"); cancel.cancel(); From a51b269f1502b69c5b2720d0a0e20b1e702ebd58 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:14:08 +0200 Subject: [PATCH 040/412] fix: hold permit until GetObject eof (#3663) previously we applied the ratelimiting only up to receiving the headers from s3, or somewhere near it. 
the commit adds an adapter which carries the permit until the AsyncRead has been disposed. fixes #3662. --- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/s3_bucket.rs | 45 +++++++++++++++++++++++----- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d154b4eaea..dab3d12263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3054,6 +3054,7 @@ dependencies = [ "hyper", "metrics", "once_cell", + "pin-project-lite", "serde", "serde_json", "tempfile", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4382fbac32..15812e8439 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -21,7 +21,7 @@ toml_edit.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true - +pin-project-lite.workspace = true workspace_hack.workspace = true [dev-dependencies] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 18a2c5dedd..93f5e0596e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -20,7 +20,10 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::{io, sync::Semaphore}; +use tokio::{ + io::{self, AsyncRead}, + sync::Semaphore, +}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -102,7 +105,7 @@ pub struct S3Bucket { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Semaphore, + concurrency_limiter: Arc, } #[derive(Default)] @@ -162,7 +165,7 @@ impl S3Bucket { client, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, - concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), + concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) } @@ -194,9 +197,10 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - let _guard = self + let permit = self .concurrency_limiter - .acquire() + .clone() + .acquire_owned() .await .context("Concurrency limiter semaphore got closed during S3 download") .map_err(DownloadError::Other)?; @@ -217,9 +221,10 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new( + download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( + permit, object_output.body.into_async_read(), - )), + ))), }) } Err(SdkError::ServiceError { @@ -240,6 +245,32 @@ impl S3Bucket { } } +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. 
+ struct RatelimitedAsyncRead { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl RatelimitedAsyncRead { + fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + RatelimitedAsyncRead { permit, inner } + } +} + +impl AsyncRead for RatelimitedAsyncRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { async fn list(&self) -> anyhow::Result> { From 38cd90dd0c85ff6286e689cae33a95fe095ffca0 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 21 Feb 2023 21:11:52 +0100 Subject: [PATCH 041/412] Add -v to ansible invocations (#3670) To get more debug output on failures --- .github/workflows/deploy-dev.yml | 2 +- .github/workflows/deploy-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 409517bf63..b080a29f7c 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -67,7 +67,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - name: Cleanup ansible folder diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 540d187274..6096ac8ab9 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -68,7 +68,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-proxy-prod-new: From 46cc8b7982b270796d6266861801b2e466319968 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 Feb 2023 12:55:41 +0300 Subject: [PATCH 042/412] Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671) We migrated all timelines to `safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be removed. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 13b44f4052..8ccb67b04a 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -32,8 +32,6 @@ storage: hosts: safekeeper-0.ap-southeast-1.aws.neon.tech: ansible_host: i-0d6f1dc5161eef894 - safekeeper-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 safekeeper-3.ap-southeast-1.aws.neon.tech: From efef68ce9972ba47ce7890e386d1b066e67f695d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 22 Feb 2023 22:52:22 +0400 Subject: [PATCH 043/412] Bump vendor/postgres to include hotfix for unlogged tables with indexes. https://github.com/neondatabase/postgres/pull/259 https://github.com/neondatabase/postgres/pull/262 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f210ac524b..b44ee1d9a5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f210ac524b42d2d6f404f8505c64de36e977d17c +Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 33f9763454..303fa4050f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 33f976345490351f951d72f81621c2263c186c9a +Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 From 91a4ea0de2a69b6003c6d224b6c96789ce1a4513 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 23 Feb 2023 15:10:22 +0100 Subject: [PATCH 044/412] Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581) ## Describe your changes Rebase vendored PostgreSQL onto 14.7 and 15.2 ## Issue ticket number and link #3579 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ``` The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL 14 and 15.2 for PostgreSQL 15. 
``` --- test_runner/fixtures/neon_fixtures.py | 10 +- test_runner/regress/test_tenant_size.py | 164 ++++++++++++++++-------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 4 files changed, 120 insertions(+), 58 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 73f224039e..c4b3d057f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1669,7 +1669,7 @@ class AbstractNeonCli(abc.ABC): timeout=timeout, ) if not res.returncode: - log.info(f"Run success: {res.stdout}") + log.info(f"Run {res.args} success: {res.stdout}") elif check_return_code: # this way command output will be in recorded and shown in CI in failure message msg = f"""\ @@ -3463,6 +3463,14 @@ def wait_for_last_flush_lsn( return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) +def wait_for_wal_insert_lsn( + env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId +) -> Lsn: + """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) + return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + def fork_at_current_lsn( env: NeonEnv, pg: Postgres, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8c2996f491..a4b5f7739a 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -3,8 +3,15 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn -from fixtures.types import Lsn +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + Postgres, + wait_for_last_flush_lsn, + wait_for_wal_insert_lsn, +) +from fixtures.types import Lsn, TenantId, TimelineId def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -324,7 +331,7 @@ def test_single_branch_get_tenant_size_grows( # inserts is larger than gc_horizon. for example 0x20000 here hid the fact # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. 
- gc_horizon = 0x30000 + gc_horizon = 0x38000 neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() @@ -334,29 +341,75 @@ def test_single_branch_get_tenant_size_grows( http_client = env.pageserver.http_client() - collected_responses: List[Tuple[Lsn, int]] = [] + collected_responses: List[Tuple[str, Lsn, int]] = [] size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): - if current_lsn - initdb_lsn > gc_horizon: + def check_size_change( + current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int + ): + if current_lsn - initdb_lsn >= gc_horizon: assert ( - size >= prev + size >= prev_size ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" else: assert ( - size > prev + size > prev_size ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" - with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: - initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + def get_current_consistent_size( + env: NeonEnv, + pg: Postgres, + size_debug_file, # apparently there is no public signature for open()... + http_client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Tuple[Lsn, int]: + consistent = False + size_debug = None + + current_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + # We want to make sure we have a self-consistent set of values. + # Size changes with WAL, so only if both before and after getting + # the size of the tenant reports the same WAL insert LSN, we're OK + # to use that (size, LSN) combination. + # Note that 'wait_for_wal_flush_lsn' is not accurate enough: There + # can be more wal after the flush LSN that can arrive on the + # pageserver before we're requesting the page size. + # Anyway, in general this is only one iteration, so in general + # this is fine. + while not consistent: + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + + after_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + consistent = current_lsn == after_lsn + current_lsn = after_lsn + size_debug_file.write(size_debug) + return (current_lsn, size) + + with env.postgres.create_start( + branch_name, + tenant_id=tenant_id, + ### autovacuum is disabled to limit WAL logging. 
+ config_lines=["autovacuum=off"], + ) as pg: + (initdb_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("INITDB", initdb_lsn, size)) + with pg.cursor() as cur: - cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") + cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)") + + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - i = 0 - while True: + for i in range(3): with pg.cursor() as cur: cur.execute( f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", @@ -365,27 +418,24 @@ def test_single_branch_get_tenant_size_grows( i += 1 - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] + if size == 0: + assert prev_size == 0 + else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + # "gc_horizon" is tuned so that it kicks in _after_ the + # insert phase, but before the update phase ends. + assert ( + current_lsn - initdb_lsn <= gc_horizon + ), "Tuning of GC window is likely out-of-date" + assert size > prev_size - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) - - if len(collected_responses) > 0: - prev = collected_responses[-1][1] - if size == 0: - assert prev == 0 - else: - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - assert current_lsn - initdb_lsn <= gc_horizon - assert size > prev - - collected_responses.append((current_lsn, size)) - - if len(collected_responses) > 2: - break + collected_responses.append(("INSERT", current_lsn, size)) while True: with pg.cursor() as cur: @@ -397,18 +447,15 @@ def test_single_branch_get_tenant_size_grows( if updated == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - prev = collected_responses[-1][1] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) - - collected_responses.append((current_lsn, size)) + collected_responses.append(("UPDATE", current_lsn, size)) while True: with pg.cursor() as cur: @@ -418,40 +465,47 @@ def test_single_branch_get_tenant_size_grows( if deleted == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + 
collected_responses.append(("DELETE", current_lsn, size)) with pg.cursor() as cur: cur.execute("DROP TABLE t0") - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + # The size of the tenant should still be as large as before we dropped + # the table, because the drop operation can still be undone in the PITR + # defined by gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + collected_responses.append(("DROP", current_lsn, size)) # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. - for lsn, size in collected_responses: - log.info(f"collected: {lsn}, {size}") + for phase, lsn, size in collected_responses: + log.info(f"collected: {phase}, {lsn}, {size}") env.pageserver.stop() env.pageserver.start() + size_after = http_client.tenant_size(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) size_debug_file.close() - size_after = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev = collected_responses[-1][2] assert size_after == prev, "size after restarting pageserver should not have changed" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b44ee1d9a5..468d3c0824 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 +Subproject commit 468d3c08245906f083fed1009759f9f953f5915d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 303fa4050f..9a2093383a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 +Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 From 240913912a4d9450ac3c67810b60f1df7e65afb4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 24 Feb 2023 13:45:32 +0200 Subject: [PATCH 045/412] Fix UNLOGGED tables. Instead of trying to create missing files on the way, send init fork contents as main fork from pageserver during basebackup. Add test for that. Call put_rel_drop for init forks; previously they weren't removed. Bump vendor/postgres to revert previous approach on Postgres side. 
Co-authored-by: Arseny Sher ref https://github.com/neondatabase/postgres/pull/264 ref https://github.com/neondatabase/postgres/pull/259 ref https://github.com/neondatabase/neon/issues/1222 --- libs/pageserver_api/src/reltag.rs | 9 ++++++ pageserver/src/basebackup.rs | 45 ++++++++++++++++++++-------- pageserver/src/walingest.rs | 4 +-- test_runner/regress/test_unlogged.py | 34 +++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 6 files changed, 79 insertions(+), 17 deletions(-) create mode 100644 test_runner/regress/test_unlogged.py diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 43d38bd986..12693379f5 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -98,6 +98,15 @@ impl RelTag { name } + + pub fn with_forknum(&self, forknum: u8) -> Self { + RelTag { + forknum, + spcnode: self.spcnode, + dbnode: self.dbnode, + relnode: self.relnode, + } + } } /// diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 06d4853274..41fa0a67bb 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; @@ -190,14 +191,31 @@ where { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; - // Gather and send relational files in each database if full backup is requested. - if self.full_backup { - for rel in self - .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) - .await? - { - self.add_rel(rel).await?; + // If full backup is requested, include all relation files. + // Otherwise only include init forks of unlogged relations. + let rels = self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await?; + for &rel in rels.iter() { + // Send init fork as main fork to provide well formed empty + // contents of UNLOGGED relations. Postgres copies it in + // `reinit.c` during recovery. + if rel.forknum == INIT_FORKNUM { + // I doubt we need _init fork itself, but having it at least + // serves as a marker relation is unlogged. + self.add_rel(rel, rel).await?; + self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; + continue; + } + + if self.full_backup { + if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM)) + { + // skip this, will include it when we reach the init fork + continue; + } + self.add_rel(rel, rel).await?; } } } @@ -220,15 +238,16 @@ where Ok(()) } - async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + /// Add contents of relfilenode `src`, naming it as `dst`. 
+ async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(tag, self.lsn, false, self.ctx) + .get_rel_size(src, self.lsn, false, self.ctx) .await?; // If the relation is empty, create an empty file if nblocks == 0 { - let file_name = tag.to_segfile_name(0); + let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar.append(&header, &mut io::empty()).await?; return Ok(()); @@ -244,12 +263,12 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } - let file_name = tag.to_segfile_name(seg as u32); + let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; self.ar.append(&header, segment_data.as_slice()).await?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3761c65668..63d568a342 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -37,7 +37,7 @@ use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; @@ -762,7 +762,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=INIT_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py new file mode 100644 index 0000000000..b6b20f1230 --- /dev/null +++ b/test_runner/regress/test_unlogged.py @@ -0,0 +1,34 @@ +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn + + +# +# Test UNLOGGED tables/relations. Postgres copies init fork contents to main +# fork to reset them during recovery. In Neon, pageserver directly sends init +# fork contents as main fork during basebackup. 
+# +def test_unlogged(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_unlogged", "empty") + pg = env.postgres.create_start("test_unlogged") + + conn = pg.connect() + cur = conn.cursor() + + cur.execute("CREATE UNLOGGED TABLE iut (id int);") + # create index to test unlogged index relation as well + cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") + cur.execute("INSERT INTO iut values (42);") + + # create another compute to fetch inital empty contents from pageserver + fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged") + pg2 = env.postgres.create_start( + "test_unlogged_basebackup", + ) + + conn2 = pg2.connect() + cur2 = conn2.cursor() + # after restart table should be empty but valid + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("EXECUTE iut_plan (43);") + cur2.execute("SELECT * FROM iut") + assert cur2.fetchall() == [(43,)] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 468d3c0824..5fb2e0bba0 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 468d3c08245906f083fed1009759f9f953f5915d +Subproject commit 5fb2e0bba06cc018ee2506f337c91751ab695454 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9a2093383a..919851e781 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 +Subproject commit 919851e7811fcb2ecfc67f35bfd63a35639c73b5 From 99752286d836653e3af3bc798871394970af8db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 14 Mar 2023 15:23:46 +0200 Subject: [PATCH 046/412] Use RollingUpdate strategy also for legacy proxy (#3814) ## Describe your changes We have previously changed the neon-proxy to use RollingUpdate. This should be enabled in legacy proxy too in order to avoid breaking connections for the clients and allow for example backups to run even during deployment. (https://github.com/neondatabase/neon/pull/3683) ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3333 --- ...od-us-west-2-eta.neon-proxy-scram-legacy.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index e67a3e4461..d23ea41bd7 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon From afd0a6b39aa10392a74ab1200fe4765936b75ce0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Mar 2023 11:44:55 +0400 Subject: [PATCH 047/412] Forward framed read buf contents to compute before proxy pass. Otherwise they get lost. Normally buffer is empty before proxy pass, but this is not the case with pipeline mode of out npm driver; fixes connection hangup introduced by b80fe41af3e for it. 
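The core of the fix can be sketched with plain tokio/bytes types rather than the proxy's own `PqStream`/`Framed` wrappers (the function name and signature here are illustrative only; the real change is in `proxy.rs` in the diff below): before switching from message-level handling to raw byte forwarding, flush whatever the framed reader has already buffered into the compute connection.

```rust
use bytes::BytesMut;
use tokio::io::{AsyncWrite, AsyncWriteExt};

/// Hand a client session over to the raw byte-copying phase. `read_buf` is
/// whatever the framed, message-level reader had already pulled off the
/// client socket (for a pipelining client: the password message and first
/// query sitting right behind the startup packet). It has to be written to
/// the compute first, otherwise those bytes are silently dropped and the
/// client hangs waiting for a reply.
async fn start_proxy_pass<C>(mut compute: C, read_buf: BytesMut) -> std::io::Result<C>
where
    C: AsyncWrite + Unpin,
{
    if !read_buf.is_empty() {
        compute.write_all(&read_buf).await?;
    }
    // ...the caller would now copy bytes client <-> compute in both directions.
    Ok(compute)
}
```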
fixes https://github.com/neondatabase/neon/issues/3822 --- libs/pq_proto/src/framed.rs | 6 +++--- proxy/src/proxy.rs | 27 +++++++++++++++++++++------ proxy/src/stream.rs | 5 +++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 972730cbab..3cdca45009 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -63,9 +63,9 @@ impl Framed { &self.stream } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { - self.stream + /// Deconstruct into the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { + (self.stream, self.read_buf) } /// Return new Framed with stream type transformed by async f, for TLS diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index abeff6a33b..efe0e8795b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -16,7 +16,7 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -209,9 +209,18 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - stream = PqStream::new( - stream.into_inner().upgrade(tls.to_server_config()).await?, - ); + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -443,11 +452,17 @@ impl Client<'_, S> { value: mut node_info, } = auth_result; - let node = connect_to_compute(&mut node_info, params, &extra, &creds) + let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?; - proxy_pass(stream.into_inner(), node.stream, &node_info.aux).await + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + proxy_pass(stream, node.stream, &node_info.aux).await } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 9dfc435e39..7cb292ed58 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::error::UserFacingError; use anyhow::bail; +use bytes::BytesMut; use pin_project_lite::pin_project; use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; @@ -27,8 +28,8 @@ impl PqStream { } } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { + /// Extract the underlying stream and read buffer. 
+ pub fn into_inner(self) -> (S, BytesMut) { self.framed.into_inner() } From 4ed51ad33b12edfec2e03b47f8fcfb7e7b8173cb Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 12:50:10 +0300 Subject: [PATCH 048/412] Add more proxy cnames --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 2 +- .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 36dac8309d..5a98217bae 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" - extraDomains: ["*.ap-southeast-1.retooldb.com"] + extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f5b2f31cb9..a9ee49d82f 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" - extraDomains: ["*.eu-central-1.retooldb.com"] + extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 0be78d868a..239a9911c7 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" - extraDomains: ["*.us-east-2.retooldb.com"] + extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 79115be0e2..c987ae236a 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" - extraDomains: ["*.us-west-2.retooldb.com"] + extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From d11d781afc300b1717050cad38ff02e21aa2dc02 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 28 Apr 2023 17:20:18 +0300 Subject: [PATCH 049/412] revert: "Add check for duplicates of generated image layers" (#4104) This reverts commit 732acc5. Reverted PR: #3869 As noted in PR #4094, we do in fact try to insert duplicates to the layer map, if L0->L1 compaction is interrupted. We do not have a proper fix for that right now, and we are in a hurry to make a release to production, so revert the changes related to this to the state that we have in production currently. We know that we have a bug here, but better to live with the bug that we've had in production for a long time, than rush a fix to production without testing it in staging first. Cc: #4094, #4088 --- pageserver/benches/bench_layer_map.rs | 4 +-- pageserver/src/tenant.rs | 7 +----- pageserver/src/tenant/layer_map.rs | 23 +++++++---------- .../layer_map/historic_layer_coverage.rs | 8 ------ pageserver/src/tenant/timeline.rs | 25 +++++++------------ 5 files changed, 21 insertions(+), 46 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 8f139a6596..ee5980212e 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d69d5e4b45..5cfc466111 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -271,10 +271,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - // Initialize without loading the layer map. We started with an empty layer map, and already - // updated it for the layers that we created during the import. - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(ctx, &mut timelines, false, true) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2355,8 +2352,6 @@ impl Tenant { ) })?; - // Initialize the timeline without loading the layer map, because we already updated the layer - // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 0ee0c6f77d..8d06ccd565 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::{bail, Result}; +use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -125,7 +125,7 @@ where /// /// Insert an on-disk layer. 
/// - pub fn insert_historic(&mut self, layer: Arc) -> anyhow::Result<()> { + pub fn insert_historic(&mut self, layer: Arc) { self.layer_map.insert_historic_noflush(layer) } @@ -273,21 +273,16 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) -> anyhow::Result<()> { - let key = historic_layer_coverage::LayerKey::from(&*layer); - if self.historic.contains(&key) { - bail!( - "Attempt to insert duplicate layer {} in layer map", - layer.short_id() - ); - } - self.historic.insert(key, Arc::clone(&layer)); + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + // TODO: See #3869, resulting #4088, attempted fix and repro #4094 + self.historic.insert( + historic_layer_coverage::LayerKey::from(&*layer), + Arc::clone(&layer), + ); if Self::is_l0(&layer) { self.l0_delta_layers.push(layer); } - - Ok(()) } /// @@ -839,7 +834,7 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()).unwrap(); + map.batch_update().insert_historic(remote.clone()); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); let replaced = map diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 1fdcd5e5a4..b63c361314 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -417,14 +417,6 @@ impl BufferedHistoricLayerCoverage { } } - pub fn contains(&self, layer_key: &LayerKey) -> bool { - match self.buffer.get(layer_key) { - Some(None) => false, // layer remove was buffered - Some(_) => true, // layer insert was buffered - None => self.layers.contains_key(layer_key), // no buffered ops for this layer - } - } - pub fn insert(&mut self, layer_key: LayerKey, value: Value) { self.buffer.insert(layer_key, Some(value)); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c671ffd63..8768841d87 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1484,7 +1484,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1516,7 +1516,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1590,7 +1590,7 @@ impl Timeline { // remote index file? // If so, rename_to_backup those files & replace their local layer with // a RemoteLayer in the layer map so that we re-download them on-demand. 
- if let Some(local_layer) = &local_layer { + if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() .expect("caller must ensure that local_layers only contains local layers"); @@ -1615,6 +1615,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1650,11 +1651,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1678,11 +1675,7 @@ impl Timeline { LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } } } @@ -2730,7 +2723,7 @@ impl Timeline { .write() .unwrap() .batch_update() - .insert_historic(Arc::new(new_delta))?; + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2935,7 +2928,7 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l))?; + updates.insert_historic(Arc::new(l)); } updates.flush(); drop(layers); @@ -3368,7 +3361,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - updates.insert_historic(x)?; + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can From 4c3ba1627b26178917880f23c4c9eede98960626 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Fri, 28 Apr 2023 20:16:02 +0300 Subject: [PATCH 050/412] Add 4 new Pageservers for retool launch --- .github/ansible/prod.us-west-2.hosts.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cf847bcb1..1fde83520e 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -41,6 +41,14 @@ storage: ansible_host: i-051642d372c0a4f32 pageserver-3.us-west-2.aws.neon.tech: ansible_host: i-00c3844beb9ad1c6b + pageserver-4.us-west-2.aws.neon.tech: + ansible_host: i-013263dd1c239adcc + pageserver-5.us-west-2.aws.neon.tech: + ansible_host: i-00ca6417c7bf96820 + pageserver-6.us-west-2.aws.neon.tech: + ansible_host: i-01cdf7d2bc1433b6a + pageserver-7.us-west-2.aws.neon.tech: + ansible_host: i-02eec9b40617db5bc safekeepers: hosts: @@ -50,4 +58,3 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 - From ff1119da6678d589026d3a74004f2dfe53411abe Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 1 May 2023 13:33:10 +0300 Subject: [PATCH 051/412] Add 2 new sets of safekeepers to us-west2 --- .github/ansible/prod.us-west-2.hosts.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 1fde83520e..be65d8e63c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ 
b/.github/ansible/prod.us-west-2.hosts.yaml @@ -58,3 +58,15 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 + safekeeper-3.us-west-2.aws.neon.tech: + ansible_host: i-089f6b9ef426dff76 + safekeeper-4.us-west-2.aws.neon.tech: + ansible_host: i-0fe6bf912c4710c82 + safekeeper-5.us-west-2.aws.neon.tech: + ansible_host: i-0a83c1c46d2b4e409 + safekeeper-6.us-west-2.aws.neon.tech: + ansible_host: i-0fef5317b8fdc9f8d + safekeeper-7.us-west-2.aws.neon.tech: + ansible_host: i-0be739190d4289bf9 + safekeeper-8.us-west-2.aws.neon.tech: + ansible_host: i-00e851803669e5cfe From 840183e51fde41b6f3019d33655d9974d2f6880e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 11 May 2023 15:08:39 +0300 Subject: [PATCH 052/412] try: higher page_service timeouts to isolate an issue --- pageserver/src/page_service.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a7a0d1a22e..bd3ece2dfc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -256,7 +256,10 @@ async fn page_service_conn_main( // // no write timeout is used, because the kernel is assumed to error writes after some time. let mut socket = tokio_io_timeout::TimeoutReader::new(socket); - socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10))); + + // timeout should be lower, but trying out multiple days for + // + socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3))); let socket = std::pin::pin!(socket); // XXX: pgbackend.run() should take the connection_ctx, From 85d6194aa40671566f349283f63a692d1861c640 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 16 May 2023 17:19:12 +0100 Subject: [PATCH 053/412] Fix regress-tests job for Postgres 15 on release branch (#4254) ## Problem Compatibility tests don't support Postgres 15 yet, but we're still trying to upload compatibility snapshot (which we do not collect). 
Ref https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129 ## Summary of changes Add `pg_version` parameter to `run-python-test-set` actions and do not upload compatibility snapshot for Postgres 15 --- .github/actions/run-python-test-set/action.yml | 11 ++++++++--- .github/workflows/build_and_test.yml | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d6c960bfda..bb120e9470 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,6 +48,10 @@ inputs: description: 'Whether to rerun flaky tests' required: false default: 'false' + pg_version: + description: 'Postgres version to use for tests' + required: false + default: 'v14' runs: using: "composite" @@ -68,7 +72,7 @@ runs: prefix: latest - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' + if: inputs.build_type != 'remote' && inputs.pg_version == 'v14' uses: ./.github/actions/download with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14 @@ -106,13 +110,14 @@ runs: ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FLAKY: ${{ inputs.rerun_flaky }} + PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} - export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} + export DEFAULT_PG_VERSION=${PG_VERSION#v} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -193,7 +198,7 @@ runs: fi - name: Upload compatibility snapshot for Postgres 14 - if: github.ref_name == 'release' + if: github.ref_name == 'release' && inputs.pg_version == 'v14' uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a09f0b4aa..ef9f171766 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -350,8 +350,8 @@ jobs: real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" rerun_flaky: true + pg_version: ${{ matrix.pg_version }} env: - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty From b0a77844f60cdcfb84648f2a7d40f0bb85c1f189 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 14 Apr 2023 19:41:02 +0300 Subject: [PATCH 054/412] Add SQL-over-HTTP endpoint to Proxy This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON response structure resembling that of the node-postgres driver. This method, using HTTP POST, achieves smaller amortized latencies in edge setups due to fewer round trips and an enhanced open connection reuse by the v8 engine. This update involves several intricacies: 1. 
SQL injection protection: We employed the extended query protocol, modifying the rust-postgres driver to send queries in one roundtrip using a text protocol rather than binary, bypassing potential issues like those identified in https://github.com/sfackler/rust-postgres/issues/1030. 2. Postgres type compatibility: As not all postgres types have binary representations (e.g., acl's in pg_class), we adjusted rust-postgres to respond with text protocol, simplifying serialization and fixing queries with text-only types in response. 3. Data type conversion: Considering JSON supports fewer data types than Postgres, we perform conversions where possible, passing all other types as strings. Key conversions include: - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain text) - postgres bool, null, text -> json bool, null, string - postgres array -> json array - postgres json and jsonb -> json object 4. Alignment with node-postgres: To facilitate integration with js libraries, we've matched the response structure of node-postgres, returning command tags and column oids. Command tag capturing was added to the rust-postgres functionality as part of this change. --- Cargo.lock | 10 +- Cargo.toml | 12 +- proxy/README.md | 86 ++- proxy/src/config.rs | 5 +- proxy/src/http.rs | 1 + proxy/src/http/sql_over_http.rs | 603 ++++++++++++++++++ proxy/src/http/websocket.rs | 86 ++- test_runner/fixtures/neon_fixtures.py | 41 +- test_runner/regress/test_metric_collection.py | 2 + test_runner/regress/test_proxy.py | 128 +++- 10 files changed, 909 insertions(+), 65 deletions(-) create mode 100644 proxy/src/http/sql_over_http.rs diff --git a/Cargo.lock b/Cargo.lock index 55418473d5..4d63ebd99d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2820,7 +2820,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "bytes", "fallible-iterator", @@ -2833,7 +2833,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "native-tls", "tokio", @@ -2844,7 +2844,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "base64 0.20.0", "byteorder", @@ -2862,7 +2862,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "bytes", "fallible-iterator", @@ -4321,7 +4321,7 @@ 
dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index c901532f86..7895459841 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,11 +126,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries @@ -166,7 +166,7 @@ tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } # Changes the MAX_THREADS limit from 4096 to 32768. # This is a temporary workaround for using tracing from many threads in safekeepers code, diff --git a/proxy/README.md b/proxy/README.md index 4ead098b73..cd76a2443f 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -1,6 +1,6 @@ # Proxy -Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: +Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. 
Following routing backends are currently implemented: * console new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) @@ -9,6 +9,90 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a * link sends login link for all usernames +Also proxy can expose following services to the external world: + +* postgres protocol over TCP -- usual postgres endpoint compatible with usual + postgres drivers +* postgres protocol over WebSockets -- same protocol tunneled over websockets + for environments where TCP connection is not available. We have our own + implementation of a client that uses node-postgres and tunnels traffic through + websockets: https://github.com/neondatabase/serverless +* SQL over HTTP -- service that accepts POST requests with SQL text over HTTP + and responds with JSON-serialised results. + + +## SQL over HTTP + +Contrary to the usual postgres proto over TCP and WebSockets using plain +one-shot HTTP request achieves smaller amortized latencies in edge setups due to +fewer round trips and an enhanced open connection reuse by the v8 engine. Also +such endpoint could be used directly without any driver. + +To play with it locally one may start proxy over a local postgres installation +(see end of this page on how to generate certs with openssl): + +``` +./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +``` + +If both postgres and proxy are running you may send a SQL query: +```json +curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ + -H 'Content-Type: application/json' \ + --data '{ + "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", + "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] + }' | jq + +{ + "command": "SELECT", + "fields": [ + { "dataTypeID": 1007, "name": "arr" }, + { "dataTypeID": 3802, "name": "obj" }, + { "dataTypeID": 23, "name": "num" } + ], + "rowCount": 1, + "rows": [ + { + "arr": [[1,2],[3,4]], + "num": 42, + "obj": { + "ikey": 4242, + "key": "val" + } + } + ] +} +``` + + +With the current approach we made the following design decisions: + +1. SQL injection protection: We employed the extended query protocol, modifying + the rust-postgres driver to send queries in one roundtrip using a text + protocol rather than binary, bypassing potential issues like those identified + in sfackler/rust-postgres#1030. + +2. Postgres type compatibility: As not all postgres types have binary + representations (e.g., acl's in pg_class), we adjusted rust-postgres to + respond with text protocol, simplifying serialization and fixing queries with + text-only types in response. + +3. Data type conversion: Considering JSON supports fewer data types than + Postgres, we perform conversions where possible, passing all other types as + strings. Key conversions include: + - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain + text) + - postgres bool, null, text -> json bool, null, string + - postgres array -> json array + - postgres json and jsonb -> json object + +4. Alignment with node-postgres: To facilitate integration with js libraries, + we've matched the response structure of node-postgres, returning command tags + and column oids. Command tag capturing was added to the rust-postgres + functionality as part of this change. 
+ ## Using SNI-based routing on localhost Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 530229b3fd..6a26cea78e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -100,9 +100,10 @@ impl CertResolver { is_default: bool, ) -> anyhow::Result<()> { let priv_key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to parse TLS keys at '{key_path}'"))?; ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); keys.pop().map(rustls::PrivateKey).unwrap() diff --git a/proxy/src/http.rs b/proxy/src/http.rs index a544157800..5cf49b669c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -3,6 +3,7 @@ //! directly relying on deps like `reqwest` (think loose coupling). pub mod server; +pub mod sql_over_http; pub mod websocket; pub use reqwest::{Request, Response, StatusCode}; diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs new file mode 100644 index 0000000000..0438a82c12 --- /dev/null +++ b/proxy/src/http/sql_over_http.rs @@ -0,0 +1,603 @@ +use futures::pin_mut; +use futures::StreamExt; +use hyper::body::HttpBody; +use hyper::{Body, HeaderMap, Request}; +use pq_proto::StartupMessageParams; +use serde_json::json; +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; +use url::Url; + +use crate::{auth, config::ProxyConfig, console}; + +#[derive(serde::Deserialize)] +struct QueryData { + query: String, + params: Vec, +} + +const APP_NAME: &str = "sql_over_http"; +const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. +// +fn json_to_pg_text(json: Vec) -> Result, serde_json::Error> { + json.iter() + .map(|value| { + match value { + Value::Null => serde_json::to_string(value), + Value::Bool(_) => serde_json::to_string(value), + Value::Number(_) => serde_json::to_string(value), + Value::Object(_) => serde_json::to_string(value), + + // no need to escape + Value::String(s) => Ok(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } + }) + .collect() +} + +// +// Serialize a JSON array to a Postgres array. Contrary to the strings in the params +// in the array we need to escape the strings. Postgres is okay with arrays of form +// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving +// it for Postgres to check. 
+// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Result { + match value { + // same + Value::Null => serde_json::to_string(value), + Value::Bool(_) => serde_json::to_string(value), + Value::Number(_) => serde_json::to_string(value), + Value::Object(_) => serde_json::to_string(value), + + // now needs to be escaped, as it is part of the array + Value::String(_) => serde_json::to_string(value), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .collect::, _>>()? + .join(","); + Ok(format!("{{{}}}", vals)) + } + } +} + +fn get_conn_info( + headers: &HeaderMap, + sni_hostname: Option, +) -> Result<(String, String, String, String), anyhow::Error> { + let connection_string = headers + .get("Neon-Connection-String") + .ok_or(anyhow::anyhow!("missing connection string"))? + .to_str()?; + + let connection_url = Url::parse(connection_string)?; + + let protocol = connection_url.scheme(); + if protocol != "postgres" && protocol != "postgresql" { + return Err(anyhow::anyhow!( + "connection string must start with postgres: or postgresql:" + )); + } + + let mut url_path = connection_url + .path_segments() + .ok_or(anyhow::anyhow!("missing database name"))?; + + let dbname = url_path + .next() + .ok_or(anyhow::anyhow!("invalid database name"))?; + + let username = connection_url.username(); + if username.is_empty() { + return Err(anyhow::anyhow!("missing username")); + } + + let password = connection_url + .password() + .ok_or(anyhow::anyhow!("no password"))?; + + // TLS certificate selector now based on SNI hostname, so if we are running here + // we are sure that SNI hostname is set to one of the configured domain names. + let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; + + let hostname = connection_url + .host_str() + .ok_or(anyhow::anyhow!("no host"))?; + + let host_header = headers + .get("host") + .and_then(|h| h.to_str().ok()) + .and_then(|h| h.split(':').next()); + + if hostname != sni_hostname { + return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); + } else if let Some(h) = host_header { + if h != hostname { + return Err(anyhow::anyhow!("mismatched host header and hostname")); + } + } + + Ok(( + username.to_owned(), + dbname.to_owned(), + hostname.to_owned(), + password.to_owned(), + )) +} + +// TODO: return different http error codes +pub async fn handle( + config: &'static ProxyConfig, + request: Request, + sni_hostname: Option, +) -> anyhow::Result { + // + // Determine the destination and connection params + // + let headers = request.headers(); + let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?; + let credential_params = StartupMessageParams::new([ + ("user", &username), + ("database", &dbname), + ("application_name", APP_NAME), + ]); + + // + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare few structures + // that this code expects. 
+ // + let tls = config.tls_config.as_ref(); + let common_names = tls.and_then(|tls| tls.common_names.clone()); + let creds = config + .auth_backend + .as_ref() + .map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names)) + .transpose()?; + let extra = console::ConsoleReqExtra { + session_id: uuid::Uuid::new_v4(), + application_name: Some(APP_NAME), + }; + let node = creds.wake_compute(&extra).await?.expect("msg"); + let conf = node.value.config; + let port = *conf.get_ports().first().expect("no port"); + let host = match conf.get_hosts().first().expect("no host") { + tokio_postgres::config::Host::Tcp(host) => host, + tokio_postgres::config::Host::Unix(_) => { + return Err(anyhow::anyhow!("unix socket is not supported")); + } + }; + + let request_content_length = match request.body().size_hint().upper() { + Some(v) => v, + None => MAX_REQUEST_SIZE + 1, + }; + + if request_content_length > MAX_REQUEST_SIZE { + return Err(anyhow::anyhow!( + "request is too large (max {MAX_REQUEST_SIZE} bytes)" + )); + } + + // + // Read the query and query params from the request body + // + let body = hyper::body::to_bytes(request.into_body()).await?; + let QueryData { query, params } = serde_json::from_slice(&body)?; + let query_params = json_to_pg_text(params)?; + + // + // Connenct to the destination + // + let (client, connection) = tokio_postgres::Config::new() + .host(host) + .port(port) + .user(&username) + .password(&password) + .dbname(&dbname) + .max_backend_message_size(MAX_RESPONSE_SIZE) + .connect(tokio_postgres::NoTls) + .await?; + + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + // + // Now execute the query and return the result + // + let row_stream = client.query_raw_txt(query, query_params).await?; + + // Manually drain the stream into a vector to leave row_stream hanging + // around to get a command tag. Also check that the response is not too + // big. 
+ pin_mut!(row_stream); + let mut rows: Vec = Vec::new(); + let mut curret_size = 0; + while let Some(row) = row_stream.next().await { + let row = row?; + curret_size += row.body_len(); + rows.push(row); + if curret_size > MAX_RESPONSE_SIZE { + return Err(anyhow::anyhow!("response too large")); + } + } + + // grab the command tag and number of rows affected + let command_tag = row_stream.command_tag().unwrap_or_default(); + let mut command_tag_split = command_tag.split(' '); + let command_tag_name = command_tag_split.next().unwrap_or_default(); + let command_tag_count = if command_tag_name == "INSERT" { + // INSERT returns OID first and then number of rows + command_tag_split.nth(1) + } else { + // other commands return number of rows (if any) + command_tag_split.next() + } + .and_then(|s| s.parse::().ok()); + + let fields = if !rows.is_empty() { + rows[0] + .columns() + .iter() + .map(|c| { + json!({ + "name": Value::String(c.name().to_owned()), + "dataTypeID": Value::Number(c.type_().oid().into()), + }) + }) + .collect::>() + } else { + Vec::new() + }; + + // convert rows to JSON + let rows = rows + .iter() + .map(pg_text_row_to_json) + .collect::, _>>()?; + + // resulting JSON format is based on the format of node-postgres result + Ok(json!({ + "command": command_tag_name, + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + })) +} + +// +// Convert postgres row with text-encoded values to JSON object +// +pub fn pg_text_row_to_json(row: &Row) -> Result { + let res = row + .columns() + .iter() + .enumerate() + .map(|(i, column)| { + let name = column.name(); + let pg_value = row.as_text(i)?; + let json_value = pg_text_to_json(pg_value, column.type_())?; + Ok((name.to_string(), json_value)) + }) + .collect::, anyhow::Error>>()?; + + Ok(Value::Object(res)) +} + +// +// Convert postgres text-encoded value to JSON value +// +pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { + if let Some(val) = pg_value { + if val == "NULL" { + return Ok(Value::Null); + } + + if let Kind::Array(elem_type) = pg_type.kind() { + return pg_array_parse(val, elem_type); + } + + match *pg_type { + Type::BOOL => Ok(Value::Bool(val == "t")), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + Ok(Value::Number(serde_json::Number::from(val))) + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + Ok(Value::Number(num)) + } else { + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + Ok(Value::String(val.to_string())) + } + } + Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), + _ => Ok(Value::String(val.to_string())), + } + } else { + Ok(Value::Null) + } +} + +// +// Parse postgres array into JSON array. +// +// This is a bit involved because we need to handle nested arrays and quoted +// values. Unlike postgres we don't check that all nested arrays have the same +// dimensions, we just return them as is. 
+// +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { + _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) +} + +fn _pg_array_parse( + pg_array: &str, + elem_type: &Type, + nested: bool, +) -> Result<(Value, usize), anyhow::Error> { + let mut pg_array_chr = pg_array.char_indices(); + let mut level = 0; + let mut quote = false; + let mut entries: Vec = Vec::new(); + let mut entry = String::new(); + + // skip bounds decoration + if let Some('[') = pg_array.chars().next() { + for (_, c) in pg_array_chr.by_ref() { + if c == '=' { + break; + } + } + } + + while let Some((mut i, mut c)) = pg_array_chr.next() { + let mut escaped = false; + + if c == '\\' { + escaped = true; + (i, c) = pg_array_chr.next().unwrap(); + } + + match c { + '{' if !quote => { + level += 1; + if level > 1 { + let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + entries.push(res); + for _ in 0..off - 1 { + pg_array_chr.next(); + } + } + } + '}' => { + level -= 1; + if level == 0 { + if !entry.is_empty() { + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + } + if nested { + return Ok((Value::Array(entries), i)); + } + } + } + '"' if !escaped => { + if quote { + // push even if empty + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry = String::new(); + } + quote = !quote; + } + ',' if !quote => { + if !entry.is_empty() { + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry = String::new(); + } + } + _ => { + entry.push(c); + } + } + } + + if level != 0 { + return Err(anyhow::anyhow!("unbalanced array")); + } + + Ok((Value::Array(entries), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_atomic_types_to_pg_params() { + let json = vec![Value::Bool(true), Value::Bool(false)]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["true", "false"]); + + let json = vec![Value::Number(serde_json::Number::from(42))]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["42"]); + + let json = vec![Value::String("foo\"".to_string())]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["foo\""]); + + let json = vec![Value::Null]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["null"]); + } + + #[test] + fn test_json_array_to_pg_array() { + // atoms and escaping + let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"] + ); + + // nested arrays + let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"] + ); + } + + #[test] + fn test_atomic_types_parse() { + assert_eq!( + pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), + json!("foo") + ); + assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); + assert_eq!( + pg_text_to_json(Some("42"), &Type::INT8).unwrap(), + json!("42") + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("42.42"), 
&Type::FLOAT4).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), + json!("NaN") + ); + assert_eq!( + pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + json!("Infinity") + ); + assert_eq!( + pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + json!("-Infinity") + ); + + let json: Value = + serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") + .unwrap(); + assert_eq!( + pg_text_to_json( + Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + &Type::JSONB + ) + .unwrap(), + json + ); + } + + #[test] + fn test_pg_array_parse_text() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::TEXT).unwrap() + } + assert_eq!( + pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), + json!(["aa\"\\,a", "cha", "bbbb"]) + ); + assert_eq!( + pt(r#"{{"foo","bar"},{"bee","bop"}}"#), + json!([["foo", "bar"], ["bee", "bop"]]) + ); + assert_eq!( + pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), + json!([[[["foo", null, "bop", "bup"]]]]) + ); + assert_eq!( + pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), + json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) + ); + } + + #[test] + fn test_pg_array_parse_bool() { + fn pb(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::BOOL).unwrap() + } + assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); + assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); + assert_eq!( + pb(r#"{{t,f},{f,t}}"#), + json!([[true, false], [false, true]]) + ); + assert_eq!( + pb(r#"{{t,NULL},{NULL,f}}"#), + json!([[true, null], [null, false]]) + ); + } + + #[test] + fn test_pg_array_parse_numbers() { + fn pn(pg_arr: &str, ty: &Type) -> Value { + pg_array_parse(pg_arr, ty).unwrap() + } + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), + json!(["NaN", "Infinity", "-Infinity"]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), + json!(["NaN", "Infinity", "-Infinity"]) + ); + } + + #[test] + fn test_pg_array_with_decoration() { + fn p(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::INT2).unwrap() + } + assert_eq!( + p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), + json!([[[1, 2, 3], [4, 5, 6]]]) + ); + } +} diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index c7676e8e14..fbb602e3d2 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -4,12 +4,17 @@ use crate::{ use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; use hyper::{ - server::{accept, conn::AddrIncoming}, + server::{ + accept, + conn::{AddrIncoming, AddrStream}, + }, upgrade::Upgraded, - Body, Request, Response, StatusCode, + Body, Method, Request, Response, StatusCode, }; use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; +use serde_json::{json, Value}; + use std::{ convert::Infallible, future::ready, @@ -21,6 +26,7 @@ use tls_listener::TlsListener; use tokio::{ io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}, net::TcpListener, + select, }; 
use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -30,6 +36,8 @@ use utils::http::{error::ApiError, json::json_response}; // Tracking issue: https://github.com/rust-lang/rust/issues/98407. use sync_wrapper::SyncWrapper; +use super::sql_over_http; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. @@ -159,6 +167,7 @@ async fn ws_handler( config: &'static ProxyConfig, cancel_map: Arc, session_id: uuid::Uuid, + sni_hostname: Option, ) -> Result, ApiError> { let host = request .headers() @@ -181,8 +190,44 @@ async fn ws_handler( // Return the response so the spawned future can continue. Ok(response) + // TODO: that deserves a refactor as now this function also handles http json client besides websockets. + // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. + } else if request.uri().path() == "/sql" && request.method() == Method::POST { + let result = select! { + _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => { + Err(anyhow::anyhow!("Query timed out")) + } + response = sql_over_http::handle(config, request, sni_hostname) => { + response + } + }; + let status_code = match result { + Ok(_) => StatusCode::OK, + Err(_) => StatusCode::BAD_REQUEST, + }; + let json = match result { + Ok(r) => r, + Err(e) => { + let message = format!("{:?}", e); + let code = match e.downcast_ref::() { + Some(e) => match e.code() { + Some(e) => serde_json::to_value(e.code()).unwrap(), + None => Value::Null, + }, + None => Value::Null, + }; + json!({ "message": message, "code": code }) + } + }; + json_response(status_code, json).map(|mut r| { + r.headers_mut().insert( + "Access-Control-Allow-Origin", + hyper::http::HeaderValue::from_static("*"), + ); + r + }) } else { - json_response(StatusCode::OK, "Connect with a websocket client") + json_response(StatusCode::BAD_REQUEST, "query is not supported") } } @@ -216,20 +261,27 @@ pub async fn task_main( } }); - let make_svc = hyper::service::make_service_fn(|_stream| async move { - Ok::<_, Infallible>(hyper::service::service_fn( - move |req: Request| async move { - let cancel_map = Arc::new(CancelMap::default()); - let session_id = uuid::Uuid::new_v4(); - ws_handler(req, config, cancel_map, session_id) - .instrument(info_span!( - "ws-client", - session = format_args!("{session_id}") - )) - .await - }, - )) - }); + let make_svc = + hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream| { + let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string()); + + async move { + Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request| { + let sni_name = sni_name.clone(); + async move { + let cancel_map = Arc::new(CancelMap::default()); + let session_id = uuid::Uuid::new_v4(); + + ws_handler(req, config, cancel_map, session_id, sni_name) + .instrument(info_span!( + "ws-client", + session = format_args!("{session_id}") + )) + .await + } + })) + } + }); hyper::Server::builder(accept::from_stream(tls_listener)) .serve(make_svc) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8ec17834ac..bde91e6783 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2042,15 +2042,19 @@ class NeonProxy(PgProtocol): proxy_port: int, http_port: int, mgmt_port: int, + external_http_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: Optional[str] = None, 
metric_collection_interval: Optional[str] = None, ): host = "127.0.0.1" - super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port) + domain = "proxy.localtest.me" # resolves to 127.0.0.1 + super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) + self.domain = domain self.host = host self.http_port = http_port + self.external_http_port = external_http_port self.neon_binpath = neon_binpath self.test_output_dir = test_output_dir self.proxy_port = proxy_port @@ -2062,11 +2066,42 @@ class NeonProxy(PgProtocol): def start(self) -> NeonProxy: assert self._popen is None + + # generate key of it doesn't exist + crt_path = self.test_output_dir / "proxy.crt" + key_path = self.test_output_dir / "proxy.key" + + if not key_path.exists(): + r = subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(crt_path), + "-keyout", + str(key_path), + "-subj", + "/CN=*.localtest.me", + "-addext", + "subjectAltName = DNS:*.localtest.me", + ] + ) + assert r.returncode == 0 + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], + *["--wss", f"{self.host}:{self.external_http_port}"], + *["-c", str(crt_path)], + *["-k", str(key_path)], *self.auth_backend.extra_args(), ] @@ -2190,6 +2225,7 @@ def link_proxy( http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -2197,6 +2233,7 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: proxy.start() @@ -2224,6 +2261,7 @@ def static_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -2231,6 +2269,7 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: proxy.start() diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 1231188896..00ea77f2e7 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -204,6 +204,7 @@ def proxy_with_metric_collector( http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -215,6 +216,7 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, auth_backend=NeonProxy.Link(), diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index ae914e384e..6be3995714 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,22 +1,32 @@ +import json import subprocess +from typing import Any, List import psycopg2 import pytest +import requests 
from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_proxy_select_1(static_proxy: NeonProxy, option_name: str): +def test_proxy_select_1(static_proxy: NeonProxy): """ A simplest smoke test: check proxy against a local postgres instance. """ - out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name") + # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) + out = static_proxy.safe_psql("select 1", sslsni=0, options="project=generic-project-name") assert out[0][0] == 1 + # no SNI, new `options=endpoint` syntax + out = static_proxy.safe_psql("select 1", sslsni=0, options="endpoint=generic-project-name") + assert out[0][0] == 1 -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_password_hack(static_proxy: NeonProxy, option_name: str): + # with SNI + out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + assert out[0][0] == 42 + + +def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for clients which can't provide the project/endpoint name via SNI or `options`. @@ -24,14 +34,16 @@ def test_password_hack(static_proxy: NeonProxy, option_name: str): user = "borat" password = "password" - static_proxy.safe_psql( - f"create role {user} with login password '{password}'", - options=f"{option_name}=irrelevant", - ) + static_proxy.safe_psql(f"create role {user} with login password '{password}'") # Note the format of `magic`! - magic = f"{option_name}=irrelevant;{password}" - static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + magic = f"project=irrelevant;{password}" + out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + assert out[0][0] == 1 + + magic = f"endpoint=irrelevant;{password}" + out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + assert out[0][0] == 1 # Must also check that invalid magic won't be accepted. with pytest.raises(psycopg2.OperationalError): @@ -69,52 +81,55 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): """ options = f"{option_name}=irrelevant -cproxytest.option=value" - out = static_proxy.safe_psql("show proxytest.option", options=options) + out = static_proxy.safe_psql("show proxytest.option", options=options, sslsni=0) assert out[0][0] == "value" options = f"-c proxytest.foo=\\ str {option_name}=irrelevant" + out = static_proxy.safe_psql("show proxytest.foo", options=options, sslsni=0) + assert out[0][0] == " str" + + options = "-cproxytest.option=value" + out = static_proxy.safe_psql("show proxytest.option", options=options) + assert out[0][0] == "value" + + options = "-c proxytest.foo=\\ str" out = static_proxy.safe_psql("show proxytest.foo", options=options) assert out[0][0] == " str" -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_auth_errors(static_proxy: NeonProxy, option_name: str): +def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. 
""" # User does not exist with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio") text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 static_proxy.safe_psql( "create role pinocchio with login password 'magic'", - options=f"{option_name}=irrelevant", ) # User exists, but password is missing with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio", password=None) text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 # User exists, but password is wrong with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio", password="bad") text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 # Finally, check that the user can connect - with static_proxy.connect( - user="pinocchio", password="magic", options=f"{option_name}=irrelevant" - ): + with static_proxy.connect(user="pinocchio", password="magic"): pass -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): +def test_forward_params_to_client(static_proxy: NeonProxy): """ Check that we forward all necessary PostgreSQL server params to client. """ @@ -140,7 +155,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): where name = any(%s) """ - with static_proxy.connect(options=f"{option_name}=irrelevant") as conn: + with static_proxy.connect() as conn: with conn.cursor() as cur: cur.execute(query, (reported_params_subset,)) for name, value in cur.fetchall(): @@ -148,18 +163,65 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): assert conn.get_parameter_status(name) == value -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) @pytest.mark.timeout(5) -def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str): +def test_close_on_connections_exit(static_proxy: NeonProxy): # Open two connections, send SIGTERM, then ensure that proxy doesn't exit # until after connections close. 
- with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect( - options=f"{option_name}=irrelevant" - ): + with static_proxy.connect(), static_proxy.connect(): static_proxy.terminate() with pytest.raises(subprocess.TimeoutExpired): static_proxy.wait_for_exit(timeout=2) # Ensure we don't accept any more connections with pytest.raises(psycopg2.OperationalError): - static_proxy.connect(options=f"{option_name}=irrelevant") + static_proxy.connect() static_proxy.wait_for_exit() + + +def test_sql_over_http(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + def q(sql: str, params: List[Any] = []) -> Any: + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + rows = q("select $1 as answer", [42])["rows"] + assert rows == [{"answer": "42"}] + + rows = q("select $1 * 1 as answer", [42])["rows"] + assert rows == [{"answer": 42}] + + rows = q("select $1::int[] as answer", [[1, 2, 3]])["rows"] + assert rows == [{"answer": [1, 2, 3]}] + + rows = q("select $1::json->'a' as answer", [{"a": {"b": 42}}])["rows"] + assert rows == [{"answer": {"b": 42}}] + + rows = q("select * from pg_class limit 1")["rows"] + assert len(rows) == 1 + + res = q("create table t(id serial primary key, val int)") + assert res["command"] == "CREATE" + assert res["rowCount"] is None + + res = q("insert into t(val) values (10), (20), (30) returning id") + assert res["command"] == "INSERT" + assert res["rowCount"] == 3 + assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}] + + res = q("select * from t") + assert res["command"] == "SELECT" + assert res["rowCount"] == 3 + + res = q("drop table t") + assert res["command"] == "DROP" + assert res["rowCount"] is None From a475cdf642df08501c3c2035bcee454da68a3dfa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 13 Jun 2023 13:34:56 +0200 Subject: [PATCH 055/412] [compute_ctl] Fix logging if catalog updates are skipped (#4480) Otherwise, it wasn't clear from the log when Postgres started up completely if catalog updates were skipped. 
Follow-up for 4936ab6 --- compute_tools/src/compute.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 977708a18f..94cebf93de 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -370,11 +370,6 @@ impl ComputeNode { // 'Close' connection drop(client); - info!( - "finished configuration of compute for project {}", - spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - Ok(()) } @@ -427,22 +422,22 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); - let spec = compute_state.pspec.as_ref().expect("spec must be set"); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - spec.spec.operation_uuid.as_deref().unwrap_or("None"), - spec.tenant_id, - spec.timeline_id, + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, ); self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(spec.storage_auth_token.clone())?; + let pg = self.start_postgres(pspec.storage_auth_token.clone())?; - if spec.spec.mode == ComputeMode::Primary && !spec.spec.skip_pg_catalog_updates { + if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { self.apply_config(&compute_state)?; } @@ -462,6 +457,11 @@ impl ComputeNode { } self.set_status(ComputeStatus::Running); + info!( + "finished configuration of compute for project {}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") + ); + Ok(pg) } From e437787c8fdefc9859a37c69c31d0bf336f84aa9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 21 Jun 2023 15:50:52 +0300 Subject: [PATCH 056/412] cargo update -p openssl (#4542) To unblock release https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054 Context: https://rustsec.org/advisories/RUSTSEC-2023-0044 --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71a6699c50..4be74614c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2349,9 +2349,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "openssl" -version = "0.10.52" +version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags", "cfg-if", @@ -2381,9 +2381,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.87" +version = "0.9.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" dependencies = [ "cc", "libc", From 6bc756129065cb3d2e1be92289fce749e2920891 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 23 Jun 2023 20:43:20 +0200 Subject: [PATCH 057/412] don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker The consumption metrics synthetic size worker does logical size calculation. 
Logical size calculation currently does synchronous disk IO. This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures. While there's work on the way to move the synchronous disk IO into spawn_blocking, the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME. Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME executor threads on CPU or sync disk IO. That work isn't done yet, as many of the mgmt tasks still _do_ disk IO. But it's not as intensive as the logical size calculations that we're fixing here. While we're at it, fix disk-usage-based eviction in a similar way. It wasn't the culprit here, according to prod logs, but it can theoretically be a little CPU-intensive. More context, including graphs from Prod: https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949 (cherry picked from commit d6e35222ea592428b78401ff0053b51424674e03) --- pageserver/src/bin/pageserver.rs | 82 ++++++++++++++++---------------- pageserver/src/http/routes.rs | 4 +- 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1fa5e4ab3b..b01ace63e4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -495,50 +495,50 @@ fn start_pageserver( Ok(()) }, ); + } - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let background_jobs_barrier = background_jobs_barrier; - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); + if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let background_jobs_barrier = background_jobs_barrier; + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); + task_mgr::spawn( + crate::BACKGROUND_RUNTIME.handle(), + TaskKind::MetricsCollection, + None, + None, + "consumption metrics collection", + true, + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - }, - ); - } + pageserver::consumption_metrics::collect_metrics( + metric_collection_endpoint, + conf.metric_collection_interval, + conf.cached_metric_collection_interval, + conf.synthetic_size_calculation_interval, + conf.id, + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + }, + ); } // Spawn a task to listen for libpq connections. It will spawn further tasks diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fc8da70cc0..0a55741f84 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1128,8 +1128,6 @@ async fn disk_usage_eviction_run( freed_bytes: 0, }; - use crate::task_mgr::MGMT_REQUEST_RUNTIME; - let (tx, rx) = tokio::sync::oneshot::channel(); let state = get_state(&r); @@ -1147,7 +1145,7 @@ async fn disk_usage_eviction_run( let _g = cancel.drop_guard(); crate::task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), + crate::task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::DiskUsageEviction, None, None, From feff887c6f5418316870533435564adbaacf8195 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 5 Jul 2023 18:40:25 +0100 Subject: [PATCH 058/412] Compile `pg_embedding` extension (#4634) ``` CREATE EXTENSION embedding; CREATE TABLE t (val real[]); INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); INSERT INTO t (val) VALUES (array[1,2,4]); SELECT * FROM t ORDER BY val <-> array[3,3,3]; val --------- {1,2,3} {1,2,4} {1,1,1} {0,0,0} (5 rows) ``` --- Dockerfile.compute-node | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 310e4c32a3..7208024d63 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -515,6 +515,25 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-embedding-pg-build" +# compile pg_embedding extension +# +######################################################################################### +FROM build-deps AS pg-embedding-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023 +# There is no release tag yet +RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \ + echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control + ######################################################################################### # # Layer "pg-anon-pg-build" @@ -671,6 +690,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From efa6aa134f4190d4ad9c10eaa55b17abd79b9457 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 7 Jul 2023 17:50:50 +0100 Subject: [PATCH 059/412] allow repeated IO errors from compute node (#4624) ## Problem #4598 compute nodes are not accessible some time after wake up due to kubernetes DNS not being fully propagated. ## Summary of changes Update connect retry mechanism to support handling IO errors and sleeping for 100ms ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- proxy/src/http/conn_pool.rs | 69 +++++++++++++++++----------- proxy/src/proxy.rs | 92 ++++++++++++++++++++++++++++++------- proxy/src/proxy/tests.rs | 17 +++++++ 3 files changed, 136 insertions(+), 42 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 52c1e2f2ce..27950d3a20 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -2,16 +2,15 @@ use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; use std::{collections::HashMap, sync::Arc}; - -use futures::TryFutureExt; +use tokio::time; use crate::config; use crate::{auth, console}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::invalidate_cache; -use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; +use crate::proxy::try_wake; +use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE}; use tracing::error; use tracing::info; @@ -223,32 +222,59 @@ async fn connect_to_compute( // This code is a copy of `connect_to_compute` from `src/proxy.rs` with // the difference that it uses `tokio_postgres` for the connection. - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + let mut num_retries = 0; + let mut should_wake = true; loop { match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(&extra).await? { - // Update `node_info` and try one more time. - Some(new) => { - *node_info = new; + Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => { + if let Some(wait_duration) = retry_connect_in(&e, num_retries) { + error!(error = ?e, "could not connect to compute node"); + if should_wake { + match try_wake(node_info, &extra, &creds).await { + Ok(Some(x)) => should_wake = x, + Ok(None) => return Err(e.into()), + Err(e) => return Err(e.into()), + } } - // Link auth doesn't work that way, so we just exit. 
- None => return Err(e), + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + } else { + return Err(e.into()); } } - other => return other, + other => return Ok(other?), } - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); + num_retries += 1; + info!(retries_left = num_retries, "retrying connect"); + } +} + +fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option { + use tokio_postgres::error::SqlState; + match err.code() { + // retry all errors at least once immediately + _ if num_retries == 0 => Some(time::Duration::ZERO), + // keep retrying connection errors every 100ms + Some( + &SqlState::CONNECTION_FAILURE + | &SqlState::CONNECTION_EXCEPTION + | &SqlState::CONNECTION_DOES_NOT_EXIST + | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ) => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) + } + // otherwise, don't retry + _ => None, } } async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, conn_info: &ConnInfo, -) -> anyhow::Result { +) -> Result { let mut config = (*node_info.config).clone(); let (client, connection) = config @@ -257,15 +283,6 @@ async fn connect_to_compute_once( .dbname(&conn_info.dbname) .max_backend_message_size(MAX_RESPONSE_SIZE) .connect(tokio_postgres::NoTls) - .inspect_err(|e: &tokio_postgres::Error| { - error!( - "failed to connect to compute node hosts={:?} ports={:?}: {}", - node_info.config.get_hosts(), - node_info.config.get_ports(), - e - ); - invalidate_cache(node_info) - }) .await?; tokio::spawn(async move { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2433412c4c..5c5353a63e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,12 +6,17 @@ use crate::{ cancellation::{self, CancelMap}, compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, - console::{self, messages::MetricsAuxInfo}, + console::{ + self, + errors::{ApiError, WakeComputeError}, + messages::MetricsAuxInfo, + }, error::io_error, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use futures::TryFutureExt; +use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; @@ -25,7 +30,9 @@ use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. -pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1; +/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n +pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; +pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -315,7 +322,6 @@ async fn connect_to_compute_once( node_info .config .connect(allow_self_signed_compute, timeout) - .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info)) .await } @@ -328,7 +334,8 @@ async fn connect_to_compute( extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, ) -> Result { - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + let mut num_retries = 0; + let mut should_wake = true; loop { // Apply startup params to the (possibly, cached) compute node info. 
node_info.config.set_startup_params(params); @@ -346,30 +353,83 @@ async fn connect_to_compute( // We only use caching in case of scram proxy backed by the console, so reduce // the timeout only in that case. let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); - let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE { + let timeout = if is_scram_proxy && num_retries == 0 { time::Duration::from_secs(2) } else { time::Duration::from_secs(10) }; match connect_to_compute_once(node_info, timeout).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(extra).map_err(io_error).await? { - // Update `node_info` and try one more time. - Some(mut new) => { - new.config.reuse_password(&node_info.config); - *node_info = new; + Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => { + if let Some(wait_duration) = retry_connect_in(&e, num_retries) { + error!(error = ?e, "could not connect to compute node"); + if should_wake { + match try_wake(node_info, extra, creds).await { + Ok(Some(x)) => { + should_wake = x; + } + Ok(None) => return Err(e), + Err(e) => return Err(io_error(e).into()), + } } - // Link auth doesn't work that way, so we just exit. - None => return Err(e), + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + } else { + return Err(e); } } other => return other, } - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); + num_retries += 1; + info!(retries_left = num_retries, "retrying connect"); + } +} + +/// Attempts to wake up the compute node. +/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable +/// * Returns Ok(Some(false)) if the wakeup succeeded +/// * Returns Ok(None) or Err(e) if there was an error +pub async fn try_wake( + node_info: &mut console::CachedNodeInfo, + extra: &console::ConsoleReqExtra<'_>, + creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, +) -> Result, WakeComputeError> { + info!("compute node's state has likely changed; requesting a wake-up"); + invalidate_cache(node_info); + match creds.wake_compute(extra).await { + // retry wake if the compute was in an invalid state + Err(WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + })) => Ok(Some(true)), + // Update `node_info` and try again. + Ok(Some(mut new)) => { + new.config.reuse_password(&node_info.config); + *node_info = new; + Ok(Some(false)) + } + Err(e) => Err(e), + Ok(None) => Ok(None), + } +} + +fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option { + use std::io::ErrorKind; + match err { + // retry all errors at least once immediately + _ if num_retries == 0 => Some(time::Duration::ZERO), + // keep retrying connection errors every 100ms + compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() { + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) + } + _ => None, + }, + // otherwise, don't retry + _ => None, } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3373c49676..a1f6cd3ed4 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,4 +1,6 @@ //! A group of high-level tests for connection establishing logic and auth. 
+use std::io; + use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; @@ -294,3 +296,18 @@ async fn scram_auth_mock() -> anyhow::Result<()> { Ok(()) } + +#[test] +fn connect_compute_total_wait() { + let err = compute::ConnectionError::CouldNotConnect(io::Error::new( + io::ErrorKind::ConnectionRefused, + "conn refused", + )); + + let mut total_wait = tokio::time::Duration::ZERO; + for num_retries in 0..10 { + total_wait += retry_connect_in(&err, num_retries).unwrap(); + } + assert!(total_wait < tokio::time::Duration::from_secs(12)); + assert!(total_wait > tokio::time::Duration::from_secs(10)); +} From 39a28d11083eefc72c5f302c21e0e74742129e14 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 12 Jul 2023 11:38:36 +0100 Subject: [PATCH 060/412] proxy wake_compute loop (#4675) ## Problem If we fail to wake up the compute node, a subsequent connect attempt will definitely fail. However, kubernetes won't fail the connection immediately, instead it hangs until we timeout (10s). ## Summary of changes Refactor the loop to allow fast retries of compute_wake and to skip a connect attempt. --- proxy/src/http/conn_pool.rs | 81 ++++++++++++++++----------- proxy/src/proxy.rs | 108 ++++++++++++++++++++++++------------ proxy/src/proxy/tests.rs | 9 +-- 3 files changed, 121 insertions(+), 77 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 27950d3a20..fb53c663c8 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -1,6 +1,7 @@ use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; +use std::ops::ControlFlow; use std::{collections::HashMap, sync::Arc}; use tokio::time; @@ -9,8 +10,7 @@ use crate::{auth, console}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::try_wake; -use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE}; +use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE}; use tracing::error; use tracing::info; @@ -184,11 +184,10 @@ impl GlobalConnPool { } } -// // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. -// +#[tracing::instrument(skip_all)] async fn connect_to_compute( config: &config::ProxyConfig, conn_info: &ConnInfo, @@ -220,54 +219,72 @@ async fn connect_to_compute( let node_info = &mut creds.wake_compute(&extra).await?.expect("msg"); - // This code is a copy of `connect_to_compute` from `src/proxy.rs` with - // the difference that it uses `tokio_postgres` for the connection. 
let mut num_retries = 0; - let mut should_wake = true; + let mut wait_duration = time::Duration::ZERO; + let mut should_wake_with_error = None; loop { + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + + // try wake the compute node if we have determined it's sensible to do so + if let Some(err) = should_wake_with_error.take() { + match try_wake(node_info, &extra, &creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err), + // there was an error communicating with the control plane + Err(e) => return Err(e.into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + wait_duration = retry_after(num_retries); + should_wake_with_error = Some(err); + + num_retries += 1; + info!(num_retries, "retrying wake compute"); + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(()))) => {} + } + } + match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => { - if let Some(wait_duration) = retry_connect_in(&e, num_retries) { - error!(error = ?e, "could not connect to compute node"); - if should_wake { - match try_wake(node_info, &extra, &creds).await { - Ok(Some(x)) => should_wake = x, - Ok(None) => return Err(e.into()), - Err(e) => return Err(e.into()), - } - } - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - } else { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !can_retry_error(&e, num_retries) { return Err(e.into()); } + wait_duration = retry_after(num_retries); + + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + invalidate_cache(node_info); + should_wake_with_error = Some(e.into()); + } } - other => return Ok(other?), } num_retries += 1; - info!(retries_left = num_retries, "retrying connect"); + info!(num_retries, "retrying connect"); } } -fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option { +fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool { use tokio_postgres::error::SqlState; match err.code() { - // retry all errors at least once immediately - _ if num_retries == 0 => Some(time::Duration::ZERO), - // keep retrying connection errors every 100ms + // retry all errors at least once + _ if num_retries == 0 => true, + // keep retrying connection errors Some( &SqlState::CONNECTION_FAILURE | &SqlState::CONNECTION_EXCEPTION | &SqlState::CONNECTION_DOES_NOT_EXIST | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, - ) => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) - } + ) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true, // otherwise, don't retry - _ => None, + _ => false, } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5c5353a63e..12ca9c5187 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use std::sync::Arc; +use std::{ops::ControlFlow, sync::Arc}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, time, @@ -32,7 +32,7 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the 
`/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; -pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); +const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -335,11 +335,37 @@ async fn connect_to_compute( creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, ) -> Result { let mut num_retries = 0; - let mut should_wake = true; + let mut wait_duration = time::Duration::ZERO; + let mut should_wake_with_error = None; loop { // Apply startup params to the (possibly, cached) compute node info. node_info.config.set_startup_params(params); + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + + // try wake the compute node if we have determined it's sensible to do so + if let Some(err) = should_wake_with_error.take() { + match try_wake(node_info, extra, creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err), + // there was an error communicating with the control plane + Err(e) => return Err(io_error(e).into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + wait_duration = retry_after(num_retries); + should_wake_with_error = Some(err); + + num_retries += 1; + info!(num_retries, "retrying wake compute"); + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(()))) => {} + } + } + // Set a shorter timeout for the initial connection attempt. // // In case we try to connect to an outdated address that is no longer valid, the @@ -359,31 +385,29 @@ async fn connect_to_compute( time::Duration::from_secs(10) }; + // do this again to ensure we have username? 
+ node_info.config.set_startup_params(params); + match connect_to_compute_once(node_info, timeout).await { - Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => { - if let Some(wait_duration) = retry_connect_in(&e, num_retries) { - error!(error = ?e, "could not connect to compute node"); - if should_wake { - match try_wake(node_info, extra, creds).await { - Ok(Some(x)) => { - should_wake = x; - } - Ok(None) => return Err(e), - Err(e) => return Err(io_error(e).into()), - } - } - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - } else { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !can_retry_error(&e, num_retries) { return Err(e); } + wait_duration = retry_after(num_retries); + + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + invalidate_cache(node_info); + should_wake_with_error = Some(e); + } } - other => return other, } num_retries += 1; - info!(retries_left = num_retries, "retrying connect"); + info!(num_retries, "retrying connect"); } } @@ -395,41 +419,51 @@ pub async fn try_wake( node_info: &mut console::CachedNodeInfo, extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result, WakeComputeError> { +) -> Result>, WakeComputeError> { info!("compute node's state has likely changed; requesting a wake-up"); - invalidate_cache(node_info); match creds.wake_compute(extra).await { // retry wake if the compute was in an invalid state Err(WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - })) => Ok(Some(true)), + })) => Ok(Some(ControlFlow::Continue(()))), // Update `node_info` and try again. Ok(Some(mut new)) => { new.config.reuse_password(&node_info.config); *node_info = new; - Ok(Some(false)) + Ok(Some(ControlFlow::Break(()))) } Err(e) => Err(e), Ok(None) => Ok(None), } } -fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option { +fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool { use std::io::ErrorKind; match err { - // retry all errors at least once immediately - _ if num_retries == 0 => Some(time::Duration::ZERO), - // keep retrying connection errors every 100ms - compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() { - ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) - } - _ => None, - }, + // retry all errors at least once + _ if num_retries == 0 => true, + // keep retrying connection errors + compute::ConnectionError::CouldNotConnect(io_err) + if num_retries < NUM_RETRIES_WAKE_COMPUTE => + { + matches!( + io_err.kind(), + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable + ) + } // otherwise, don't retry - _ => None, + _ => false, + } +} + +pub fn retry_after(num_retries: u32) -> time::Duration { + match num_retries { + 0 => time::Duration::ZERO, + _ => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries) + } } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a1f6cd3ed4..b9215cd90e 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,6 +1,4 @@ //! A group of high-level tests for connection establishing logic and auth. 
-use std::io; - use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; @@ -299,14 +297,9 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { - let err = compute::ConnectionError::CouldNotConnect(io::Error::new( - io::ErrorKind::ConnectionRefused, - "conn refused", - )); - let mut total_wait = tokio::time::Duration::ZERO; for num_retries in 0..10 { - total_wait += retry_connect_in(&err, num_retries).unwrap(); + total_wait += retry_after(num_retries); } assert!(total_wait < tokio::time::Duration::from_secs(12)); assert!(total_wait > tokio::time::Duration::from_secs(10)); From 4204960942fc456906396072c63b373f600b89e5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 1 Aug 2023 11:52:35 +0200 Subject: [PATCH 061/412] ci: fix upload-postgres-extensions-to-s3 job commit commit 5f8fd640bfa8e5d4d23dbc3df1b0a521ec666e56 Author: Alek Westover Date: Wed Jul 26 08:24:03 2023 -0400 Upload Test Remote Extensions (#4792) switched to using the release tag instead of `latest`, but, the `promote-images` job only uploads `latest` to the prod ECR. The switch to using release tag was good in principle, but, reverting that part to make the release pipeine work. Note that a proper fix should abandon use of `:latest` tag at all: currently, if a `main` pipeline runs concurrently with a `release` pipeline, the `release` pipeline may end up using the `main` pipeline's images. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27bad61639..dcdc388a93 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -955,7 +955,7 @@ jobs: version: [ v14, v15 ] env: - EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }} From 6eae4fc9aa9d143f6f2000fa2d761ccccf079710 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 3 Aug 2023 07:48:09 +0100 Subject: [PATCH 062/412] Release 2023-08-02: update pg_embedding (#4877) Cherry-picking ca4d71a9549be5638d84e6a9ff5c66ca14cbb05d from `main` into the `release` Co-authored-by: Vadim Kharitonov --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 9759faf733..b2d096dc43 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -551,8 +551,8 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \ - echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \ +RUN wget 
https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \ + echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \ mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From b1ddd01289f1092781d5603a88d0e75a9c23bc2b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 3 Aug 2023 15:28:31 +0200 Subject: [PATCH 063/412] Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4889) Co-authored-by: Konstantin Knizhnik Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ebedb34d01..da3885c34d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f +Subproject commit da3885c34db312afd555802be2ce985fafd1d8ad diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 1220c8a63f..770c6dffc5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 1220c8a63f00101829f9222a5821fc084b4384c7 +Subproject commit 770c6dffc5ef6aac05bf049693877fb377eea6fc diff --git a/vendor/revisions.json b/vendor/revisions.json index f5d7428cd9..8579b5abaa 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7", - "postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f" + "postgres-v15": "770c6dffc5ef6aac05bf049693877fb377eea6fc", + "postgres-v14": "da3885c34db312afd555802be2ce985fafd1d8ad" } From e78ac221077827a21f155ab7dc697b13159479d7 Mon Sep 17 00:00:00 2001 From: Felix Prasanna <91577249+fprasx@users.noreply.github.com> Date: Tue, 8 Aug 2023 13:08:46 -0500 Subject: [PATCH 064/412] release fix: revert vm builder bump from 0.13.1 -> 0.15.0-alpha1 (#4932) This reverts commit 682dfb3a31ad13fd70b1d05fb099e4f54f93bf4e. hotfix for a CLI arg issue in the monitor --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 16fc22ce62..dcdc388a93 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -794,7 +794,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.15.0-alpha1 + VM_BUILDER_VERSION: v0.13.1 steps: - name: Checkout From d7d066d493074b1d68deaba59c15165f96ac7bda Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 8 Aug 2023 15:19:24 +0100 Subject: [PATCH 065/412] proxy: delay auth on retry (#4929) ## Problem When an endpoint is shutting down, it can take a few seconds. Currently when starting a new compute, this causes an "endpoint is in transition" error. We need to add delays before retrying to ensure that we allow time for the endpoint to shutdown properly. ## Summary of changes Adds a delay before retrying in auth. 
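As a side note on the retry pacing reused here: below is a minimal, standalone sketch (not the proxy crate itself) of the 1.5x-growth backoff with the 100 ms `BASE_RETRY_WAIT_DURATION` assumed as the base; summing ten steps gives roughly 11.3 s, consistent with the 10 to 12 second bounds asserted by the `connect_compute_total_wait` test.

```
use std::time::Duration;

// Sketch of the backoff shape: base 100 ms, growth factor 1.5 per retry.
fn retry_after(num_retries: u32) -> Duration {
    Duration::from_millis(100).mul_f64(1.5_f64.powi(num_retries as i32))
}

fn main() {
    // Ten retries add up to ~11.3 s in total.
    let total: Duration = (0..10u32).map(retry_after).sum();
    println!("total backoff over 10 retries: {total:?}");
}
```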
connect_to_compute already has this delay --- proxy/src/auth/backend/classic.rs | 7 +++++-- proxy/src/proxy.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index d8ee1e8b64..46bd215f3b 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -5,7 +5,7 @@ use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra}, - proxy::handle_try_wake, + proxy::{handle_try_wake, retry_after}, sasl, scram, stream::PqStream, }; @@ -62,10 +62,13 @@ pub(super) async fn authenticate( } Ok(ControlFlow::Continue(e)) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - num_retries += 1; } Ok(ControlFlow::Break(n)) => break n, } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + tokio::time::sleep(wait_duration).await; }; if let Some(keys) = scram_keys { use tokio_postgres::config::AuthKeys; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e6fcf901d9..0267d767ee 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -545,7 +545,7 @@ impl ShouldRetry for compute::ConnectionError { } } -fn retry_after(num_retries: u32) -> time::Duration { +pub fn retry_after(num_retries: u32) -> time::Duration { // 1.5 seems to be an ok growth factor heuristic BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32)) } From 2f74287c9b24f2e575bd84fa8d06701478f1f26f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 15 Aug 2023 15:31:48 +0000 Subject: [PATCH 066/412] Disable neon-pool-opt-in --- proxy/src/http/sql_over_http.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index aa06a91bba..09642b9a93 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -193,7 +193,7 @@ pub async fn handle( let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested - let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = false; // isolation level and read only From fec2ad6283184047cfb758880ca095fbd1493115 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 15 Aug 2023 15:45:03 +0000 Subject: [PATCH 067/412] Fix lint --- proxy/src/http/sql_over_http.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 09642b9a93..c7fd7f6e39 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -44,7 +44,6 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); -static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); From 4e2e44e5240959ac48bb30571eee136c0226c989 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 22 Aug 2023 10:06:14 +0200 Subject: [PATCH 068/412] Enable neon-pool-opt-in (#5062) --- proxy/src/http/sql_over_http.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 782e37641c..fe57096105 100644 --- 
a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -44,6 +44,7 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); @@ -194,7 +195,7 @@ pub async fn handle( let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested - let allow_pool = false; + let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable From 67db8432b481e585f654616f2260949aa137e9a2 Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Tue, 22 Aug 2023 11:41:32 -0400 Subject: [PATCH 069/412] Fix cargo deny errors (#5068) ## Problem cargo deny lint broken Links to the CVEs: [rustsec.org/advisories/RUSTSEC-2023-0052](https://rustsec.org/advisories/RUSTSEC-2023-0052) [rustsec.org/advisories/RUSTSEC-2023-0053](https://rustsec.org/advisories/RUSTSEC-2023-0053) One is fixed, the other one isn't so we allow it (for now), to unbreak CI. Then later we'll try to get rid of webpki in favour of the rustls fork. ## Summary of changes ``` +ignore = ["RUSTSEC-2023-0052"] ``` --- Cargo.lock | 4 ++-- deny.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 23741e2a3d..81d2a04122 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3569,9 +3569,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.100.1" +version = "0.100.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab" dependencies = [ "ring", "untrusted", diff --git a/deny.toml b/deny.toml index cfb699bb21..71006b14cd 100644 --- a/deny.toml +++ b/deny.toml @@ -18,7 +18,7 @@ vulnerability = "deny" unmaintained = "warn" yanked = "warn" notice = "warn" -ignore = [] +ignore = ["RUSTSEC-2023-0052"] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: From ce1753d036ea4d8be87e446ebe8c448e134862d1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 25 Aug 2023 13:08:45 +0100 Subject: [PATCH 070/412] proxy: dont return connection pending (#5107) ## Problem We were returning Pending when a connection had a notice/notification (introduced recently in #5020). When returning pending, the runtime assumes you will call `cx.waker().wake()` in order to continue processing. We weren't doing that, so the connection task would get stuck ## Summary of changes Don't return pending. 
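Returning `Poll::Pending` is only sound when the inner poll has registered the task's waker; otherwise the runtime never polls the task again. A minimal sketch of the fixed control flow, where `poll_next_message` is a stand-in closure rather than the real `tokio_postgres` `Connection::poll_message` API:

```
use std::task::{Context, Poll};

// Only propagate Pending when the inner poll returned Pending (it has
// registered our waker); on Ready, handle the message and poll again.
fn drive_connection<M>(
    cx: &mut Context<'_>,
    mut poll_next_message: impl FnMut(&mut Context<'_>) -> Poll<Option<M>>,
) -> Poll<()> {
    loop {
        match poll_next_message(cx) {
            // Handle the notice/notification, then loop immediately;
            // returning Pending here would never be woken up again.
            Poll::Ready(Some(_message)) => continue,
            // Connection closed: the task is finished.
            Poll::Ready(None) => return Poll::Ready(()),
            // The inner poll registered the waker, so Pending is safe now.
            Poll::Pending => return Poll::Pending,
        }
    }
}
```

The actual change below keeps the notice/notification handling and logging; the essential difference is that Ready messages are consumed in a loop instead of returning Pending.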
Loop instead --- proxy/src/http/conn_pool.rs | 42 +++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index c02ec61945..e771e5d7ed 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -408,9 +408,9 @@ async fn connect_to_compute_once( let (tx, mut rx) = tokio::sync::watch::channel(session); let conn_id = uuid::Uuid::new_v4(); - let span = info_span!(parent: None, "connection", %conn_info, %conn_id); + let span = info_span!(parent: None, "connection", %conn_id); span.in_scope(|| { - info!(%session, "new connection"); + info!(%conn_info, %session, "new connection"); }); tokio::spawn( @@ -420,26 +420,28 @@ async fn connect_to_compute_once( info!(%session, "changed session"); } - let message = ready!(connection.poll_message(cx)); + loop { + let message = ready!(connection.poll_message(cx)); - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session, "notice: {}", notice); - Poll::Pending + match message { + Some(Ok(AsyncMessage::Notice(notice))) => { + info!(%session, "notice: {}", notice); + } + Some(Ok(AsyncMessage::Notification(notif))) => { + warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); + } + Some(Ok(_)) => { + warn!(%session, "unknown message"); + } + Some(Err(e)) => { + error!(%session, "connection error: {}", e); + return Poll::Ready(()) + } + None => { + info!("connection closed"); + return Poll::Ready(()) + } } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); - Poll::Pending - } - Some(Ok(_)) => { - warn!(%session, "unknown message"); - Poll::Pending - } - Some(Err(e)) => { - error!(%session, "connection error: {}", e); - Poll::Ready(()) - } - None => Poll::Ready(()), } }) .instrument(span) From 521438a5c6bbc5ecc061f35593b2dbb1715e0e06 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 12 Sep 2023 11:23:46 +0200 Subject: [PATCH 071/412] fix deadlock around TENANTS (#5285) The sequence that can lead to a deadlock: 1. DELETE request gets all the way to `tenant.shutdown(progress, false).await.is_err() ` , while holding TENANTS.read() 2. POST request for tenant creation comes in, calls `tenant_map_insert`, it does `let mut guard = TENANTS.write().await;` 3. Something that `tenant.shutdown()` needs to wait for needs a `TENANTS.read().await`. The only case identified in exhaustive manual scanning of the code base is this one: Imitate size access does `get_tenant().await`, which does `TENANTS.read().await` under the hood. In the above case (1) waits for (3), (3)'s read-lock request is queued behind (2)'s write-lock, and (2) waits for (1). Deadlock. I made a reproducer/proof-that-above-hypothesis-holds in https://github.com/neondatabase/neon/pull/5281 , but, it's not ready for merge yet and we want the fix _now_. fixes https://github.com/neondatabase/neon/issues/5284 --- pageserver/src/tenant/mgr.rs | 2 ++ .../src/tenant/timeline/eviction_task.rs | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0cf5c36b87..74faee1115 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -503,6 +503,8 @@ pub enum GetTenantError { /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. 
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. +/// +/// This method is cancel-safe. pub async fn get_tenant( tenant_id: TenantId, active_only: bool, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3e407dda57..39f0d03a01 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -328,9 +328,24 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else { - // likely, we're shutting down - return ControlFlow::Break(()); + // + // It is critical we are responsive to cancellation here. Otherwise, we deadlock with + // tenant deletion (holds TENANTS in read mode) any other task that attempts to + // acquire TENANTS in write mode before we here call get_tenant. + // See https://github.com/neondatabase/neon/issues/5284. + let res = tokio::select! { + _ = cancel.cancelled() => { + return ControlFlow::Break(()); + } + res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => { + res + } + }; + let tenant = match res { + Ok(t) => t, + Err(_) => { + return ControlFlow::Break(()); + } }; let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { From 373c7057cc73fd13f2873c353ced250ddb900700 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 14 Sep 2023 03:21:50 -0700 Subject: [PATCH 072/412] vm-monitor: Fix cgroup throttling (#5303) I believe this (not actual IO problems) is the cause of the "disk speed issue" that we've had for VMs recently. See e.g.: 1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH 2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659 The vm-informant (and now, the vm-monitor, its replacement) is supposed to gradually increase the `neon-postgres` cgroup's memory.high value, because otherwise the kernel will throttle all the processes in the cgroup. This PR fixes a bug with the vm-monitor's implementation of this behavior. --- Other references, for the vm-informant's implementation: - Original issue: neondatabase/autoscaling#44 - Original PR: neondatabase/autoscaling#223 --- libs/vm_monitor/src/cgroup.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 4f529d16fd..fe1c514f76 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -315,12 +315,8 @@ impl CgroupWatcher { where E: Stream>, { - // There are several actions might do when receiving a `memory.high`, - // such as freezing the cgroup, or increasing its `memory.high`. We don't - // want to do these things too often (because postgres needs to run, and - // we only have so much memory). These timers serve as rate limits for this. let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO)); - let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO)); + let mut last_memory_high_increase_at: Option = None; let mut events = pin!(events); // Are we waiting to be upscaled? 
Could be true if we request upscale due @@ -332,6 +328,8 @@ impl CgroupWatcher { upscale = upscales.recv() => { let Sequenced { seqnum, data } = upscale .context("failed to listen on upscale notification channel")?; + waiting_on_upscale = false; + last_memory_high_increase_at = None; self.last_upscale_seqnum.store(seqnum, Ordering::Release); info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); } @@ -396,12 +394,17 @@ impl CgroupWatcher { .send(()) .await .context("failed to request upscale")?; + waiting_on_upscale = true; continue; } // Shoot, we can't freeze or and we're still waiting on upscale, // increase memory.high to reduce throttling - if wait_to_increase_memory_high.is_elapsed() { + let can_increase_memory_high = match last_memory_high_increase_at { + None => true, + Some(t) => t.elapsed() > self.config.memory_high_increase_every, + }; + if can_increase_memory_high { info!( "received memory.high event, \ but too soon to refreeze and already requested upscale \ @@ -437,12 +440,11 @@ impl CgroupWatcher { ); self.set_high_bytes(new_high) .context("failed to set memory.high")?; - wait_to_increase_memory_high - .as_mut() - .reset(Instant::now() + self.config.memory_high_increase_every) + last_memory_high_increase_at = Some(Instant::now()); + continue; } - // we can't do anything + info!("received memory.high event, but can't do anything"); } }; } From 6686ede30f4e5271c16616d6ca08a17ab307e9c8 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 14 Sep 2023 16:17:50 +0100 Subject: [PATCH 073/412] Update checksum for pg_hint_plan (#5309) ## Problem The checksum for `pg_hint_plan` doesn't match: ``` sha256sum: WARNING: 1 computed checksum did NOT match ``` Ref https://github.com/neondatabase/neon/actions/runs/6185715461/job/16793609251?pr=5307 It seems that the release was retagged yesterday: https://github.com/ossc-db/pg_hint_plan/releases/tag/REL16_1_6_0 I don't see any malicious changes from 15_1.5.1: https://github.com/ossc-db/pg_hint_plan/compare/REL15_1_5_1...REL16_1_6_0, so it should be ok to update. ## Summary of changes - Update checksum for `pg_hint_plan` 16_1.6.0 --- Dockerfile.compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index ecf76f97c2..d724185d5d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -416,7 +416,7 @@ RUN case "${PG_VERSION}" in \ ;; \ "v16") \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ - export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \ + export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ From a5987eebfdf1ea3cc39d9730ed9f1f092e0ebf0e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 15 Sep 2023 10:32:25 +0300 Subject: [PATCH 074/412] References to old and new blocks were mixed in xlog_heap_update handler (#5312) ## Problem See https://neondb.slack.com/archives/C05L7D1JAUS/p1694614585955029 https://www.notion.so/neondatabase/Duplicate-key-issue-651627ce843c45188fbdcb2d30fd2178 ## Summary of changes Swap old/new block references ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? 
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pageserver/src/walingest.rs | 16 +++++----- test_runner/regress/test_vm_bits.py | 48 +++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9192af0ee8..d5ef1f2a24 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -470,14 +470,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { @@ -526,14 +526,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { @@ -582,14 +582,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { @@ -745,14 +745,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. 
Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d8034b31b0..8e20a44407 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -19,18 +19,40 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - # Create a test table and freeze it to set the VM bit. + # Create a test table for a few different scenarios and freeze it to set the VM bits. cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_delete VALUES (1)") cur.execute("VACUUM FREEZE vmtest_delete") - cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)") - cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g") - cur.execute("VACUUM FREEZE vmtest_update") + cur.execute("CREATE TABLE vmtest_hot_update (id integer PRIMARY KEY, filler text)") + cur.execute("INSERT INTO vmtest_hot_update VALUES (1, 'x')") + cur.execute("VACUUM FREEZE vmtest_hot_update") + + cur.execute("CREATE TABLE vmtest_cold_update (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_cold_update SELECT g FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_cold_update") + + cur.execute( + "CREATE TABLE vmtest_cold_update2 (id integer PRIMARY KEY, filler text) WITH (fillfactor=100)" + ) + cur.execute("INSERT INTO vmtest_cold_update2 SELECT g, '' FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_cold_update2") # DELETE and UPDATE the rows. cur.execute("DELETE FROM vmtest_delete WHERE id = 1") - cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") + cur.execute("UPDATE vmtest_hot_update SET filler='x' WHERE id = 1") + cur.execute("UPDATE vmtest_cold_update SET id = 5000 WHERE id = 1") + + # Clear the VM bit on the last page with an INSERT. Then clear the VM bit on + # the page where row 1 is (block 0), by doing an UPDATE. The UPDATE is a + # cold update, and the new tuple goes to the last page, which already had + # its VM bit cleared. The point is that the UPDATE *only* clears the VM bit + # on the page containing the old tuple. We had a bug where we got the old + # and new pages mixed up, and that only shows up when one of the bits is + # cleared, but not the other one. 
+ cur.execute("INSERT INTO vmtest_cold_update2 VALUES (9999, 'x')") + # Clears the VM bit on the old page + cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1") # Branch at this point, to test that later fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") @@ -50,9 +72,13 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): """ ) - cur.execute("SELECT * FROM vmtest_delete WHERE id = 1") + cur.execute("SELECT id FROM vmtest_delete WHERE id = 1") assert cur.fetchall() == [] - cur.execute("SELECT * FROM vmtest_update WHERE id = 1") + cur.execute("SELECT id FROM vmtest_hot_update WHERE id = 1") + assert cur.fetchall() == [(1,)] + cur.execute("SELECT id FROM vmtest_cold_update WHERE id = 1") + assert cur.fetchall() == [] + cur.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1") assert cur.fetchall() == [] cur.close() @@ -77,7 +103,11 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): """ ) - cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1") + cur_new.execute("SELECT id FROM vmtest_delete WHERE id = 1") assert cur_new.fetchall() == [] - cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1") + cur_new.execute("SELECT id FROM vmtest_hot_update WHERE id = 1") + assert cur_new.fetchall() == [(1,)] + cur_new.execute("SELECT id FROM vmtest_cold_update WHERE id = 1") + assert cur_new.fetchall() == [] + cur_new.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1") assert cur_new.fetchall() == [] From 46857e82828f570750cd2ac52eedf7260500a192 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 15 Sep 2023 15:27:00 +0100 Subject: [PATCH 075/412] Postgres 14/15: Use previous extensions versions --- Dockerfile.compute-node | 170 +++++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 20 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d724185d5d..f7a15312ce 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -74,8 +74,21 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ - echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export POSTGIS_VERSION=3.3.2 \ + export POSTGIS_CHECKSUM=9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 \ + ;; \ + "v16") \ + export POSTGIS_VERSION=3.3.3 \ + export POSTGIS_CHECKSUM=74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ + echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ @@ -124,8 +137,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \ - echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PLV8_VERSION=3.1.5 \ + export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ + ;; \ + "v16") \ + export PLV8_VERSION=3.1.8 \ + export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ + echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -172,8 +198,21 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ - echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export H3PG_VERSION=4.1.2 \ + export H3PG_CHECKSUM=c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 \ + ;; \ + "v16") \ + export H3PG_VERSION=4.1.3 \ + export H3PG_CHECKSUM=5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v${H3PG_VERSION}.tar.gz -O h3-pg.tar.gz && \ + echo "${H3PG_CHECKSUM} h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -243,8 +282,21 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 FROM build-deps AS hypopg-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export HYPOPG_VERSION=1.3.1 \ + export HYPOPG_CHECKSUM=e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e \ + ;; \ + "v16") \ + export HYPOPG_VERSION=1.4.0 \ + export HYPOPG_CHECKSUM=0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/${HYPOPG_VERSION}.tar.gz -O hypopg.tar.gz && \ + echo "${HYPOPG_CHECKSUM} hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -307,8 +359,21 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta FROM build-deps AS ip4r-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ - echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export IP4R_VERSION=2.4.1 \ + export IP4R_CHECKSUM=78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 \ + ;; \ + "v16") \ + export IP4R_VERSION=2.4.2 \ + export IP4R_CHECKSUM=0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/${IP4R_VERSION}.tar.gz -O ip4r.tar.gz && \ + echo "${IP4R_CHECKSUM} ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -323,8 +388,21 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i FROM build-deps AS prefix-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ - echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PREFIX_VERSION=1.2.9 \ + export PREFIX_CHECKSUM=38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e \ + ;; \ + "v16") \ + export PREFIX_VERSION=1.2.10 \ + export PREFIX_CHECKSUM=4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v$P{PREFIX_VERSION}.tar.gz -O prefix.tar.gz && \ + echo "${PREFIX_CHECKSUM} prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -339,8 +417,21 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p FROM build-deps AS hll-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ - echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export HLL_VERSION=2.17 \ + export HLL_CHECKSUM=9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 \ + ;; \ + "v16") \ + export HLL_VERSION=2.18 \ + export HLL_CHECKSUM=e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v${HLL_VERSION}.tar.gz -O hll.tar.gz && \ + echo "${HLL_CHECKSUM} hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -355,8 +446,21 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar FROM build-deps AS plpgsql-check-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \ - echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PLPGSQL_CHECK_VERSION=2.3.2 \ + export PLPGSQL_CHECK_CHECKSUM=9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b \ + ;; \ + "v16") \ + export PLPGSQL_CHECK_VERSION=2.4.0 \ + export PLPGSQL_CHECK_CHECKSUM=9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v${PLPGSQL_CHECK_VERSION}.tar.gz -O plpgsql_check.tar.gz && \ + echo "${PLPGSQL_CHECK_CHECKSUM} plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -465,8 +569,21 @@ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ - echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PG_CRON_VERSION=1.5.2 \ + export PG_CRON_CHECKSUM=6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 \ + ;; \ + "v16") \ + export PG_CRON_VERSION=1.6.0 \ + export PG_CRON_CHECKSUM=383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v${PG_CRON_VERSION}.tar.gz -O pg_cron.tar.gz && \ + echo "${PG_CRON_CHECKSUM} pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -492,8 +609,21 @@ RUN apt-get update && \ libfreetype6-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export RDKIT_VERSION=2023_03_1 \ + export RDKIT_CHECKSUM=db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c \ + ;; \ + "v16") \ + export RDKIT_VERSION=2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . 
&& \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ From 23ee4f3050539125e0b7a731f3f6de4637a1a1a9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 15 Sep 2023 15:45:23 +0100 Subject: [PATCH 076/412] Revert plv8 only --- Dockerfile.compute-node | 153 +++++----------------------------------- 1 file changed, 18 insertions(+), 135 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index f7a15312ce..892c5e93f3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -74,21 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export POSTGIS_VERSION=3.3.2 \ - export POSTGIS_CHECKSUM=9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 \ - ;; \ - "v16") \ - export POSTGIS_VERSION=3.3.3 \ - export POSTGIS_CHECKSUM=74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ - echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ + echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ @@ -198,21 +185,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export H3PG_VERSION=4.1.2 \ - export H3PG_CHECKSUM=c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 \ - ;; \ - "v16") \ - export H3PG_VERSION=4.1.3 \ - export H3PG_CHECKSUM=5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/zachasme/h3-pg/archive/refs/tags/v${H3PG_VERSION}.tar.gz -O h3-pg.tar.gz && \ - echo "${H3PG_CHECKSUM} h3-pg.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -282,21 +256,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 FROM build-deps AS hypopg-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export HYPOPG_VERSION=1.3.1 \ - export HYPOPG_CHECKSUM=e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e \ - ;; \ - "v16") \ - export HYPOPG_VERSION=1.4.0 \ - export HYPOPG_CHECKSUM=0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/${HYPOPG_VERSION}.tar.gz -O hypopg.tar.gz && \ - echo "${HYPOPG_CHECKSUM} hypopg.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ + echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -359,21 +320,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta FROM build-deps AS ip4r-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export IP4R_VERSION=2.4.1 \ - export IP4R_CHECKSUM=78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 \ - ;; \ - "v16") \ - export IP4R_VERSION=2.4.2 \ - export IP4R_CHECKSUM=0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/${IP4R_VERSION}.tar.gz -O ip4r.tar.gz && \ - echo "${IP4R_CHECKSUM} ip4r.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ + echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -388,21 +336,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS prefix-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PREFIX_VERSION=1.2.9 \ - export PREFIX_CHECKSUM=38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e \ - ;; \ - "v16") \ - export PREFIX_VERSION=1.2.10 \ - export PREFIX_CHECKSUM=4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/dimitri/prefix/archive/refs/tags/v$P{PREFIX_VERSION}.tar.gz -O prefix.tar.gz && \ - echo "${PREFIX_CHECKSUM} prefix.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ + echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -417,21 +352,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS hll-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export HLL_VERSION=2.17 \ - export HLL_CHECKSUM=9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 \ - ;; \ - "v16") \ - export HLL_VERSION=2.18 \ - export HLL_CHECKSUM=e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v${HLL_VERSION}.tar.gz -O hll.tar.gz && \ - echo "${HLL_CHECKSUM} hll.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ + echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -446,21 +368,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS plpgsql-check-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PLPGSQL_CHECK_VERSION=2.3.2 \ - export PLPGSQL_CHECK_CHECKSUM=9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b \ - ;; \ - "v16") \ - export PLPGSQL_CHECK_VERSION=2.4.0 \ - export PLPGSQL_CHECK_CHECKSUM=9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v${PLPGSQL_CHECK_VERSION}.tar.gz -O plpgsql_check.tar.gz && \ - echo "${PLPGSQL_CHECK_CHECKSUM} plpgsql_check.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \ + echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -569,21 +478,8 @@ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PG_CRON_VERSION=1.5.2 \ - export PG_CRON_CHECKSUM=6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 \ - ;; \ - "v16") \ - export PG_CRON_VERSION=1.6.0 \ - export PG_CRON_CHECKSUM=383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/citusdata/pg_cron/archive/refs/tags/v${PG_CRON_VERSION}.tar.gz -O pg_cron.tar.gz && \ - echo "${PG_CRON_CHECKSUM} pg_cron.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ + echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -609,21 +505,8 @@ RUN apt-get update && \ libfreetype6-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export RDKIT_VERSION=2023_03_1 \ - export RDKIT_CHECKSUM=db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c \ - ;; \ - "v16") \ - export RDKIT_VERSION=2023_03_3 \ - export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ - echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ + echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ From ae0634b7beea1777d734e66089b8eb45cd656861 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 28 Sep 2023 00:52:39 -0700 Subject: [PATCH 077/412] Bump vm-builder v0.17.11 -> v0.17.12 (#5407) Only relevant change is neondatabase/autoscaling#534 - refer there for more details. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7271a8d29f..65a2101dc6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -834,7 +834,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.17.11 + VM_BUILDER_VERSION: v0.17.12 steps: - name: Checkout From 72aa6b9fdd5b0ccbfe6aec8add2d7a951d4cc5a3 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 27 Sep 2023 13:48:30 +0200 Subject: [PATCH 078/412] Fix neon_zeroextend's WAL logging (#5387) When you log more than a few blocks, you need to reserve the space in advance. We didn't do that, so we got errors. Now we do that, and shouldn't get errors. --- pgxn/neon/pagestore_smgr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 919bca03e9..2e4364cbfa 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1790,6 +1790,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (!XLogInsertAllowed()) return; + /* ensure we have enough xlog buffers to log max-sized records */ + XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0); + + /* + * Iterate over all the pages. They are collected into batches of + * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each + * batch. + */ while (remblocks > 0) { int count = Min(remblocks, XLR_MAX_BLOCK_ID); From 7222777784eb84efc13093783d9aabbe1260b5ef Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 3 Oct 2023 18:42:39 +0100 Subject: [PATCH 079/412] Update checksums for pg_jsonschema & pg_graphql (#5455) ## Problem Folks have re-taged releases for `pg_jsonschema` and `pg_graphql` (to increase timeouts on their CI), for us, these are a noop changes, but unfortunately, this will cause our builds to fail due to checksums mismatch (this might not strike right away because of the build cache). 
- https://github.com/supabase/pg_jsonschema/commit/8ba7c7be9d3f12c0cf30c7105db303ce2aaf12c2 - https://github.com/supabase/pg_graphql/commit/aa7509370a4d34a26d48126ac24bc937a009c115 ## Summary of changes - `pg_jsonschema` update checksum - `pg_graphql` update checksum --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index e53ec47688..7e34b66d68 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -651,7 +651,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "b1bd95009c8809bd6cda9a37777f8b7df425ff1a34976c1e7a4b31cf838ace66 pg_jsonschema.tar.gz" | sha256sum --check && \ + echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ @@ -668,7 +668,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "ea85d45f8af1d2382e2af847f88102f930782c00e6c612308e6f08f27309d5f7 pg_graphql.tar.gz" | sha256sum --check && \ + echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ From a6b2f4e54ef97333c35834788bf555fc10e3f4bb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 17 Oct 2023 11:29:48 +0200 Subject: [PATCH 080/412] limit imitate accesses concurrency, using same semaphore as compactions (#5578) Before this PR, when we restarted pageserver, we'd see a rush of `$number_of_tenants` concurrent eviction tasks starting to do imitate accesses building up in the period of `[init_order allows activations, $random_access_delay + EvictionPolicyLayerAccessThreshold::period]`. We simply cannot handle that degree of concurrent IO. We already solved the problem for compactions by adding a semaphore. So, this PR shares that semaphore for use by evictions. Part of https://github.com/neondatabase/neon/issues/5479 Which is again part of https://github.com/neondatabase/neon/issues/4743 Risks / Changes In System Behavior ================================== * we don't do evictions as timely as we currently do * we log a bunch of warnings about eviction taking too long * imitate accesses and compactions compete for the same concurrency limit, so, they'll slow each other down through this shares semaphore Changes ======= - Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs` - Rename it to `CONCURRENT_BACKGROUND_TASKS` - Use it also for the eviction imitate accesses: - Imitate acceses are both per-TIMELINE and per-TENANT - The per-TENANT is done through coalescing all the per-TIMELINE tasks via a tokio mutex `eviction_task_tenant_state`. 
- We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the beginning of the eviction iteration, much before the imitate acesses start (and they may not even start at all in the given iteration, as they happen only every $threshold). - Acquiring early is **sub-optimal** because when the per-timline tasks coalesce on the `eviction_task_tenant_state` mutex, they are already holding a CONCURRENT_BACKGROUND_TASKS permit. - It's also unfair because tenants with many timelines win the CONCURRENT_BACKGROUND_TASKS more often. - I don't think there's another way though, without refactoring more of the imitate accesses logic, e.g, making it all per-tenant. - Add metrics for queue depth behind the semaphore. I found these very useful to understand what work is queued in the system. - The metrics are tagged by the new `BackgroundLoopKind`. - On a green slate, I would have used `TaskKind`, but we already had pre-existing labels whose names didn't map exactly to task kind. Also the task kind is kind of a lower-level detail, so, I think it's fine to have a separate enum to identify background work kinds. Future Work =========== I guess I could move the eviction tasks from a ticker to "sleep for $period". The benefit would be that the semaphore automatically "smears" the eviction task scheduling over time, so, we only have the rush on restart but a smeared-out rush afterward. The downside is that this perverts the meaning of "$period", as we'd actually not run the eviction at a fixed period. It also means the the "took to long" warning & metric becomes meaningless. Then again, that is already the case for the compaction and gc tasks, which do sleep for `$period` instead of using a ticker. (cherry picked from commit 9256788273d5661ced0b2661a8751e2aa86fbb59) --- pageserver/src/consumption_metrics.rs | 10 ++- pageserver/src/metrics.rs | 20 +++++ pageserver/src/tenant/tasks.rs | 81 +++++++++++++++++-- pageserver/src/tenant/timeline.rs | 39 +++------ .../src/tenant/timeline/eviction_task.rs | 18 ++++- 5 files changed, 131 insertions(+), 37 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 72a2099d92..13f7977946 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -2,6 +2,7 @@ //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause}; use camino::Utf8PathBuf; use consumption_metrics::EventType; @@ -143,7 +144,7 @@ pub async fn collect_metrics( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), metric_collection_interval, - "consumption_metrics_collect_metrics", + BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); } } @@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker( } if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { + // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. 
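// As a standalone sketch of the shared-limiter pattern described in the commit
// message above, and the same shape as the concurrent_background_tasks_rate_limit()
// helper this patch adds in tenant/tasks.rs (illustrative only, not part of the
// patch: the permit count, task kinds, and task count below are made up). The core
// idea is a single tokio::sync::Semaphore shared by every background loop, with
// permit acquisition racing a cancellation token so shutdown never blocks on it:
use std::sync::Arc;

use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;

#[derive(Debug, Clone, Copy)]
enum LoopKind {
    Compaction,
    Gc,
    Eviction,
}

#[tokio::main]
async fn main() {
    // One shared limiter: at most 3 background iterations in flight at a time.
    let limiter = Arc::new(Semaphore::new(3));
    let cancel = CancellationToken::new();

    let mut tasks = Vec::new();
    for i in 0..8usize {
        let kind = [LoopKind::Compaction, LoopKind::Gc, LoopKind::Eviction][i % 3];
        let limiter = Arc::clone(&limiter);
        let cancel = cancel.clone();
        tasks.push(tokio::spawn(async move {
            // Either we get a permit, or the process is shutting down.
            let _permit = tokio::select! {
                permit = limiter.acquire() => permit.expect("semaphore is never closed"),
                _ = cancel.cancelled() => return,
            };
            // One iteration of the background work runs here, holding the permit.
            println!("task {i} running {kind:?}");
        }));
    }
    for t in tasks {
        let _ = t.await;
    }
}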
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await { error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } @@ -277,7 +283,7 @@ async fn calculate_synthetic_size_worker( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), synthetic_size_calculation_interval, - "consumption_metrics_synthetic_size_worker", + BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker, ); } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c154b4a4ca..eea3de0583 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1067,6 +1067,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + &["task"], + ) + .unwrap() + }); + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap() + }); + pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index df3ffd08d3..00c8ced68a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -14,6 +14,73 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; +static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = + once_cell::sync::Lazy::new(|| { + let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let permits = usize::max( + 1, + // while a lot of the work is done on spawn_blocking, we still do + // repartitioning in the async context. this should give leave us some workers + // unblocked to be blocked on other work, hopefully easing any outside visible + // effects of restarts. + // + // 6/8 is a guess; previously we ran with unlimited 8 and more from + // spawn_blocking. + (total_threads * 3).checked_div(4).unwrap_or(0), + ); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!( + permits < total_threads, + "need threads avail for shorter work" + ); + tokio::sync::Semaphore::new(permits) + }); + +#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum BackgroundLoopKind { + Compaction, + Gc, + Eviction, + ConsumptionMetricsCollectMetrics, + ConsumptionMetricsSyntheticSizeWorker, +} + +impl BackgroundLoopKind { + fn as_static_str(&self) -> &'static str { + let s: &'static str = self.into(); + s + } +} + +pub(crate) enum RateLimitError { + Cancelled, +} + +pub(crate) async fn concurrent_background_tasks_rate_limit( + loop_kind: BackgroundLoopKind, + _ctx: &RequestContext, + cancel: &CancellationToken, +) -> Result { + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT + .with_label_values(&[loop_kind.as_static_str()]) + .inc(); + scopeguard::defer!( + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc(); + ); + tokio::select! 
{ + permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { + match permit { + Ok(permit) => Ok(permit), + Err(_closed) => unreachable!("we never close the semaphore"), + } + }, + _ = cancel.cancelled() => { + Err(RateLimitError::Cancelled) + } + } +} + /// Start per tenant background loops: compaction and gc. pub fn start_background_loops( tenant: &Arc, @@ -116,7 +183,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "compaction"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -184,7 +251,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "gc"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -258,7 +325,11 @@ pub(crate) async fn random_init_delay( } /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. -pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { +pub(crate) fn warn_when_period_overrun( + elapsed: Duration, + period: Duration, + task: BackgroundLoopKind, +) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { // humantime does no significant digits clamping whereas Duration's debug is a bit more @@ -267,11 +338,11 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task warn!( ?elapsed, period = %humantime::format_duration(period), - task, + ?task, "task iteration took longer than the configured period" ); crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task, &format!("{}", period.as_secs())]) + .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) .inc(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 537f776176..a6f92646a9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -44,6 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer, }; +use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -684,37 +685,17 @@ impl Timeline { ) -> anyhow::Result<()> { const ROUNDS: usize = 2; - static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; - let permits = usize::max( - 1, - // while a lot of the work is done on spawn_blocking, we still do - // repartitioning in the async context. this should give leave us some workers - // unblocked to be blocked on other work, hopefully easing any outside visible - // effects of restarts. - // - // 6/8 is a guess; previously we ran with unlimited 8 and more from - // spawn_blocking. 
- (total_threads * 3).checked_div(4).unwrap_or(0), - ); - assert_ne!(permits, 0, "we will not be adding in permits later"); - assert!( - permits < total_threads, - "need threads avail for shorter work" - ); - tokio::sync::Semaphore::new(permits) - }); - // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = tokio::select! { - permit = CONCURRENT_COMPACTIONS.acquire() => { - permit - }, - _ = cancel.cancelled() => { - return Ok(()); - } + let _permit = match super::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Compaction, + ctx, + cancel, + ) + .await + { + Ok(permit) => permit, + Err(RateLimitError::Cancelled) => return Ok(()), }; let last_record_lsn = self.get_last_record_lsn(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 9bf31d85d4..38da993deb 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,6 +30,7 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, + tasks::{BackgroundLoopKind, RateLimitError}, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -129,7 +130,11 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + p.period, + BackgroundLoopKind::Eviction, + ); crate::metrics::EVICTION_ITERATION_DURATION .get_metric_with_label_values(&[ &format!("{}", p.period.as_secs()), @@ -150,6 +155,17 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); + let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Eviction, + ctx, + cancel, + ) + .await + { + Ok(permit) => permit, + Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), + }; + // If we evict layers but keep cached values derived from those layers, then // we face a storm of on-demand downloads after pageserver restart. // The reason is that the restart empties the caches, and so, the values From 4d13bae449284088c496137b1af7501966c8a84f Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 17 Oct 2023 15:30:40 -0700 Subject: [PATCH 081/412] vm-monitor: Switch from memory.high to polling memory.stat (#5524) tl;dr it's really hard to avoid throttling from memory.high, and it counts tmpfs & page cache usage, so it's also hard to make sense of. In the interest of fixing things quickly with something that should be *good enough*, this PR switches to instead periodically fetch memory statistics from the cgroup's memory.stat and use that data to determine if and when we should upscale. This PR fixes #5444, which has a lot more detail on the difficulties we've hit with memory.high. This PR also supersedes #5488. --- libs/vm_monitor/README.md | 4 +- libs/vm_monitor/src/cgroup.rs | 786 ++++++++---------------------- libs/vm_monitor/src/dispatcher.rs | 29 +- libs/vm_monitor/src/runner.rs | 285 ++++++----- 4 files changed, 366 insertions(+), 738 deletions(-) diff --git a/libs/vm_monitor/README.md b/libs/vm_monitor/README.md index 53cdecd9f3..fdd943077d 100644 --- a/libs/vm_monitor/README.md +++ b/libs/vm_monitor/README.md @@ -27,8 +27,8 @@ and old one if it exists. 
* the filecache: a struct that allows communication with the Postgres file cache. On startup, we connect to the filecache and hold on to the connection for the entire monitor lifetime. -* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by -listening for `memory.high` events and setting its `memory.{high,max}` values. +* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory +usage and sends rolling aggregates to the runner. * the runner: the runner marries the filecache and cgroup watcher together, communicating with the agent throught the `Dispatcher`, and then calling filecache and cgroup watcher functions as needed to upscale and downscale diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 15e972505e..7160a42df2 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -1,161 +1,38 @@ -use std::{ - fmt::{Debug, Display}, - fs, - pin::pin, - sync::atomic::{AtomicU64, Ordering}, -}; +use std::fmt::{self, Debug, Formatter}; +use std::time::{Duration, Instant}; -use anyhow::{anyhow, bail, Context}; +use anyhow::{anyhow, Context}; use cgroups_rs::{ - freezer::FreezerController, - hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT}, + hierarchies::{self, is_cgroup2_unified_mode}, memory::MemController, - MaxValue, - Subsystem::{Freezer, Mem}, + Subsystem, }; -use inotify::{EventStream, Inotify, WatchMask}; -use tokio::sync::mpsc::{self, error::TryRecvError}; -use tokio::time::{Duration, Instant}; -use tokio_stream::{Stream, StreamExt}; +use tokio::sync::watch; use tracing::{info, warn}; -use crate::protocol::Resources; -use crate::MiB; - -/// Monotonically increasing counter of the number of memory.high events -/// the cgroup has experienced. -/// -/// We use this to determine if a modification to the `memory.events` file actually -/// changed the `high` field. If not, we don't care about the change. When we -/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT` -/// to see if it changed since last time. -pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0); - -/// Monotonically increasing counter that gives each cgroup event a unique id. -/// -/// This allows us to answer questions like "did this upscale arrive before this -/// memory.high?". This static is also used by the `Sequenced` type to "tag" values -/// with a sequence number. As such, prefer to used the `Sequenced` type rather -/// than this static directly. -static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0); - -/// A memory event type reported in memory.events. -#[derive(Debug, Eq, PartialEq, Copy, Clone)] -pub enum MemoryEvent { - Low, - High, - Max, - Oom, - OomKill, - OomGroupKill, -} - -impl MemoryEvent { - fn as_str(&self) -> &str { - match self { - MemoryEvent::Low => "low", - MemoryEvent::High => "high", - MemoryEvent::Max => "max", - MemoryEvent::Oom => "oom", - MemoryEvent::OomKill => "oom_kill", - MemoryEvent::OomGroupKill => "oom_group_kill", - } - } -} - -impl Display for MemoryEvent { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.as_str()) - } -} - /// Configuration for a `CgroupWatcher` #[derive(Debug, Clone)] pub struct Config { - // The target difference between the total memory reserved for the cgroup - // and the value of the cgroup's memory.high. 
- // - // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may - // use (equal to system memory, minus whatever's taken out for the file cache). - oom_buffer_bytes: u64, + /// Interval at which we should be fetching memory statistics + memory_poll_interval: Duration, - // The amount of memory, in bytes, below a proposed new value for - // memory.high that the cgroup's memory usage must be for us to downscale - // - // In other words, we can downscale only when: - // - // memory.current + memory_high_buffer_bytes < (proposed) memory.high - // - // TODO: there's some minor issues with this approach -- in particular, that we might have - // memory in use by the kernel's page cache that we're actually ok with getting rid of. - pub(crate) memory_high_buffer_bytes: u64, - - // The maximum duration, in milliseconds, that we're allowed to pause - // the cgroup for while waiting for the autoscaler-agent to upscale us - max_upscale_wait: Duration, - - // The required minimum time, in milliseconds, that we must wait before re-freezing - // the cgroup while waiting for the autoscaler-agent to upscale us. - do_not_freeze_more_often_than: Duration, - - // The amount of memory, in bytes, that we should periodically increase memory.high - // by while waiting for the autoscaler-agent to upscale us. - // - // This exists to avoid the excessive throttling that happens when a cgroup is above its - // memory.high for too long. See more here: - // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217 - memory_high_increase_by_bytes: u64, - - // The period, in milliseconds, at which we should repeatedly increase the value - // of the cgroup's memory.high while we're waiting on upscaling and memory.high - // is still being hit. - // - // Technically speaking, this actually serves as a rate limit to moderate responding to - // memory.high events, but these are roughly equivalent if the process is still allocating - // memory. - memory_high_increase_every: Duration, -} - -impl Config { - /// Calculate the new value for the cgroups memory.high based on system memory - pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 { - total_system_mem.saturating_sub(self.oom_buffer_bytes) - } + /// The number of samples used in constructing aggregated memory statistics + memory_history_len: usize, + /// The number of most recent samples that will be periodically logged. + /// + /// Each sample is logged exactly once. Increasing this value means that recent samples will be + /// logged less frequently, and vice versa. + /// + /// For simplicity, this value must be greater than or equal to `memory_history_len`. + memory_history_log_interval: usize, } impl Default for Config { fn default() -> Self { Self { - oom_buffer_bytes: 100 * MiB, - memory_high_buffer_bytes: 100 * MiB, - // while waiting for upscale, don't freeze for more than 20ms every 1s - max_upscale_wait: Duration::from_millis(20), - do_not_freeze_more_often_than: Duration::from_millis(1000), - // while waiting for upscale, increase memory.high by 10MiB every 25ms - memory_high_increase_by_bytes: 10 * MiB, - memory_high_increase_every: Duration::from_millis(25), - } - } -} - -/// Used to represent data that is associated with a certain point in time, such -/// as an upscale request or memory.high event. -/// -/// Internally, creating a `Sequenced` uses a static atomic counter to obtain -/// a unique sequence number. 
Sequence numbers are monotonically increasing, -/// allowing us to answer questions like "did this upscale happen after this -/// memory.high event?" by comparing the sequence numbers of the two events. -#[derive(Debug, Clone)] -pub struct Sequenced { - seqnum: u64, - data: T, -} - -impl Sequenced { - pub fn new(data: T) -> Self { - Self { - seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel), - data, + memory_poll_interval: Duration::from_millis(100), + memory_history_len: 5, // use 500ms of history for decision-making + memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy) } } } @@ -170,74 +47,14 @@ impl Sequenced { pub struct CgroupWatcher { pub config: Config, - /// The sequence number of the last upscale. - /// - /// If we receive a memory.high event that has a _lower_ sequence number than - /// `last_upscale_seqnum`, then we know it occured before the upscale, and we - /// can safely ignore it. - /// - /// Note: Like the `events` field, this doesn't _need_ interior mutability but we - /// use it anyways so that methods take `&self`, not `&mut self`. - last_upscale_seqnum: AtomicU64, - - /// A channel on which we send messages to request upscale from the dispatcher. - upscale_requester: mpsc::Sender<()>, - /// The actual cgroup we are watching and managing. cgroup: cgroups_rs::Cgroup, } -/// Read memory.events for the desired event type. -/// -/// `path` specifies the path to the desired `memory.events` file. -/// For more info, see the `memory.events` section of the [kernel docs] -/// -fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result { - let contents = fs::read_to_string(path) - .with_context(|| format!("failed to read memory.events from {path}"))?; - - // Then contents of the file look like: - // low 42 - // high 101 - // ... - contents - .lines() - .filter_map(|s| s.split_once(' ')) - .find(|(e, _)| *e == event.as_str()) - .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}")) - .and_then(|(_, count)| { - count - .parse::() - .with_context(|| format!("failed to parse memory.{event} as u64")) - }) -} - -/// Create an event stream that produces events whenever the file at the provided -/// path is modified. -fn create_file_watcher(path: &str) -> anyhow::Result> { - info!("creating file watcher for {path}"); - let inotify = Inotify::init().context("failed to initialize file watcher")?; - inotify - .watches() - .add(path, WatchMask::MODIFY) - .with_context(|| format!("failed to start watching {path}"))?; - inotify - // The inotify docs use [0u8; 1024] so we'll just copy them. We only need - // to store one event at a time - if the event gets written over, that's - // ok. We still see that there is an event. For more information, see: - // https://man7.org/linux/man-pages/man7/inotify.7.html - .into_event_stream([0u8; 1024]) - .context("failed to start inotify event stream") -} - impl CgroupWatcher { /// Create a new `CgroupWatcher`. #[tracing::instrument(skip_all, fields(%name))] - pub fn new( - name: String, - // A channel on which to send upscale requests - upscale_requester: mpsc::Sender<()>, - ) -> anyhow::Result<(Self, impl Stream>)> { + pub fn new(name: String) -> anyhow::Result { // TODO: clarify exactly why we need v2 // Make sure cgroups v2 (aka unified) are supported if !is_cgroup2_unified_mode() { @@ -245,410 +62,203 @@ impl CgroupWatcher { } let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name); - // Start monitoring the cgroup for memory events. 
In general, for - // cgroups v2 (aka unified), metrics are reported in files like - // > `/sys/fs/cgroup/{name}/{metric}` - // We are looking for `memory.high` events, which are stored in the - // file `memory.events`. For more info, see the `memory.events` section - // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files - let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name); - let memory_events = create_file_watcher(&path) - .with_context(|| format!("failed to create event watcher for {path}"))? - // This would be nice with with .inspect_err followed by .ok - .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) { - Ok(high) => Some(high), - Err(error) => { - // TODO: Might want to just panic here - warn!(?error, "failed to read high events count from {}", &path); - None - } - }) - // Only report the event if the memory.high count increased - .filter_map(|high| { - if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high { - Some(high) - } else { - None - } - }) - .map(Sequenced::new); - - let initial_count = get_event_count( - &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name), - MemoryEvent::High, - )?; - - info!(initial_count, "initial memory.high event count"); - - // Hard update `MEMORY_EVENT_COUNT` since there could have been processes - // running in the cgroup before that caused it to be non-zero. - MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel); - - Ok(( - Self { - cgroup, - upscale_requester, - last_upscale_seqnum: AtomicU64::new(0), - config: Default::default(), - }, - memory_events, - )) + Ok(Self { + cgroup, + config: Default::default(), + }) } /// The entrypoint for the `CgroupWatcher`. #[tracing::instrument(skip_all)] - pub async fn watch( + pub async fn watch( &self, - // These are ~dependency injected~ (fancy, I know) because this function - // should never return. - // -> therefore: when we tokio::spawn it, we don't await the JoinHandle. - // -> therefore: if we want to stick it in an Arc so many threads can access - // it, methods can never take mutable access. - // - note: we use the Arc strategy so that a) we can call this function - // right here and b) the runner can call the set/get_memory methods - // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self, - // we just pass them in here instead of holding them in fields, as that - // would require this method to take &mut self. - mut upscales: mpsc::Receiver>, - events: E, - ) -> anyhow::Result<()> - where - E: Stream>, - { - let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO)); - let mut last_memory_high_increase_at: Option = None; - let mut events = pin!(events); - - // Are we waiting to be upscaled? Could be true if we request upscale due - // to a memory.high event and it does not arrive in time. - let mut waiting_on_upscale = false; - - loop { - tokio::select! { - upscale = upscales.recv() => { - let Sequenced { seqnum, data } = upscale - .context("failed to listen on upscale notification channel")?; - waiting_on_upscale = false; - last_memory_high_increase_at = None; - self.last_upscale_seqnum.store(seqnum, Ordering::Release); - info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); - } - event = events.next() => { - let Some(Sequenced { seqnum, .. 
}) = event else { - bail!("failed to listen for memory.high events") - }; - // The memory.high came before our last upscale, so we consider - // it resolved - if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum { - info!( - "received memory.high event, but it came before our last upscale -> ignoring it" - ); - continue; - } - - // The memory.high came after our latest upscale. We don't - // want to do anything yet, so peek the next event in hopes - // that it's an upscale. - if let Some(upscale_num) = self - .upscaled(&mut upscales) - .context("failed to check if we were upscaled")? - { - if upscale_num > seqnum { - info!( - "received memory.high event, but it came before our last upscale -> ignoring it" - ); - continue; - } - } - - // If it's been long enough since we last froze, freeze the - // cgroup and request upscale - if wait_to_freeze.is_elapsed() { - info!("received memory.high event -> requesting upscale"); - waiting_on_upscale = self - .handle_memory_high_event(&mut upscales) - .await - .context("failed to handle upscale")?; - wait_to_freeze - .as_mut() - .reset(Instant::now() + self.config.do_not_freeze_more_often_than); - continue; - } - - // Ok, we can't freeze, just request upscale - if !waiting_on_upscale { - info!("received memory.high event, but too soon to refreeze -> requesting upscale"); - - // Make check to make sure we haven't been upscaled in the - // meantine (can happen if the agent independently decides - // to upscale us again) - if self - .upscaled(&mut upscales) - .context("failed to check if we were upscaled")? - .is_some() - { - info!("no need to request upscaling because we got upscaled"); - continue; - } - self.upscale_requester - .send(()) - .await - .context("failed to request upscale")?; - waiting_on_upscale = true; - continue; - } - - // Shoot, we can't freeze or and we're still waiting on upscale, - // increase memory.high to reduce throttling - let can_increase_memory_high = match last_memory_high_increase_at { - None => true, - Some(t) => t.elapsed() > self.config.memory_high_increase_every, - }; - if can_increase_memory_high { - info!( - "received memory.high event, \ - but too soon to refreeze and already requested upscale \ - -> increasing memory.high" - ); - - // Make check to make sure we haven't been upscaled in the - // meantine (can happen if the agent independently decides - // to upscale us again) - if self - .upscaled(&mut upscales) - .context("failed to check if we were upscaled")? - .is_some() - { - info!("no need to increase memory.high because got upscaled"); - continue; - } - - // Request upscale anyways (the agent will handle deduplicating - // requests) - self.upscale_requester - .send(()) - .await - .context("failed to request upscale")?; - - let memory_high = - self.get_memory_high_bytes().context("failed to get memory.high")?; - let new_high = memory_high + self.config.memory_high_increase_by_bytes; - info!( - current_high_bytes = memory_high, - new_high_bytes = new_high, - "updating memory.high" - ); - self.set_memory_high_bytes(new_high) - .context("failed to set memory.high")?; - last_memory_high_increase_at = Some(Instant::now()); - continue; - } - - info!("received memory.high event, but can't do anything"); - } - }; - } - } - - /// Handle a `memory.high`, returning whether we are still waiting on upscale - /// by the time the function returns. - /// - /// The general plan for handling a `memory.high` event is as follows: - /// 1. Freeze the cgroup - /// 2. 
Start a timer for `self.config.max_upscale_wait` - /// 3. Request upscale - /// 4. After the timer elapses or we receive upscale, thaw the cgroup. - /// 5. Return whether or not we are still waiting for upscale. If we are, - /// we'll increase the cgroups memory.high to avoid getting oom killed - #[tracing::instrument(skip_all)] - async fn handle_memory_high_event( - &self, - upscales: &mut mpsc::Receiver>, - ) -> anyhow::Result { - // Immediately freeze the cgroup before doing anything else. - info!("received memory.high event -> freezing cgroup"); - self.freeze().context("failed to freeze cgroup")?; - - // We'll use this for logging durations - let start_time = Instant::now(); - - // Await the upscale until we have to unfreeze - let timed = - tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales)); - - // Request the upscale - info!( - wait = ?self.config.max_upscale_wait, - "sending request for immediate upscaling", - ); - self.upscale_requester - .send(()) - .await - .context("failed to request upscale")?; - - let waiting_on_upscale = match timed.await { - Ok(Ok(())) => { - info!(elapsed = ?start_time.elapsed(), "received upscale in time"); - false - } - // **important**: unfreeze the cgroup before ?-reporting the error - Ok(Err(e)) => { - info!("error waiting for upscale -> thawing cgroup"); - self.thaw() - .context("failed to thaw cgroup after errored waiting for upscale")?; - Err(e.context("failed to await upscale"))? - } - Err(_) => { - info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale"); - true - } - }; - - info!("thawing cgroup"); - self.thaw().context("failed to thaw cgroup")?; - - Ok(waiting_on_upscale) - } - - /// Checks whether we were just upscaled, returning the upscale's sequence - /// number if so. - #[tracing::instrument(skip_all)] - fn upscaled( - &self, - upscales: &mut mpsc::Receiver>, - ) -> anyhow::Result> { - let Sequenced { seqnum, data } = match upscales.try_recv() { - Ok(upscale) => upscale, - Err(TryRecvError::Empty) => return Ok(None), - Err(TryRecvError::Disconnected) => { - bail!("upscale notification channel was disconnected") - } - }; - - // Make sure to update the last upscale sequence number - self.last_upscale_seqnum.store(seqnum, Ordering::Release); - info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); - Ok(Some(seqnum)) - } - - /// Await an upscale event, discarding any `memory.high` events received in - /// the process. - /// - /// This is used in `handle_memory_high_event`, where we need to listen - /// for upscales in particular so we know if we can thaw the cgroup early. - #[tracing::instrument(skip_all)] - async fn await_upscale( - &self, - upscales: &mut mpsc::Receiver>, + updates: watch::Sender<(Instant, MemoryHistory)>, ) -> anyhow::Result<()> { - let Sequenced { seqnum, .. } = upscales - .recv() - .await - .context("error listening for upscales")?; + // this requirement makes the code a bit easier to work with; see the config for more. + assert!(self.config.memory_history_len <= self.config.memory_history_log_interval); - self.last_upscale_seqnum.store(seqnum, Ordering::Release); - Ok(()) - } + let mut ticker = tokio::time::interval(self.config.memory_poll_interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0 - /// Get the cgroup's name. 
- pub fn path(&self) -> &str { - self.cgroup.path() - } -} + let mem_controller = self.memory()?; -// Methods for manipulating the actual cgroup -impl CgroupWatcher { - /// Get a handle on the freezer subsystem. - fn freezer(&self) -> anyhow::Result<&FreezerController> { - if let Some(Freezer(freezer)) = self - .cgroup - .subsystems() - .iter() - .find(|sub| matches!(sub, Freezer(_))) - { - Ok(freezer) - } else { - anyhow::bail!("could not find freezer subsystem") + // buffer for samples that will be logged. once full, it remains so. + let history_log_len = self.config.memory_history_log_interval; + let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len]; + + for t in 0_u64.. { + ticker.tick().await; + + let now = Instant::now(); + let mem = Self::memory_usage(mem_controller); + + let i = t as usize % history_log_len; + history_log_buf[i] = mem; + + // We're taking *at most* memory_history_len values; we may be bounded by the total + // number of samples that have come in so far. + let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize; + // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact + // that we just inserted a value there, so the end of the iterator will *include* the + // value at i, rather than stopping just short of it. + let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count); + + let summary = MemoryHistory { + avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::() + / samples_count as u64, + samples_count, + samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32, + }; + + // Log the current history if it's time to do so. Because `history_log_buf` has length + // equal to the logging interval, we can just log the entire buffer every time we set + // the last entry, which also means that for this log line, we can ignore that it's a + // ring buffer (because all the entries are in order of increasing time). + if i == history_log_len - 1 { + info!( + history = ?MemoryStatus::debug_slice(&history_log_buf), + summary = ?summary, + "Recent cgroup memory statistics history" + ); + } + + updates + .send((now, summary)) + .context("failed to send MemoryHistory")?; } - } - /// Attempt to freeze the cgroup. - pub fn freeze(&self) -> anyhow::Result<()> { - self.freezer() - .context("failed to get freezer subsystem")? - .freeze() - .context("failed to freeze") - } - - /// Attempt to thaw the cgroup. - pub fn thaw(&self) -> anyhow::Result<()> { - self.freezer() - .context("failed to get freezer subsystem")? - .thaw() - .context("failed to thaw") + unreachable!() } /// Get a handle on the memory subsystem. - /// - /// Note: this method does not require `self.memory_update_lock` because - /// getting a handle to the subsystem does not access any of the files we - /// care about, such as memory.high and memory.events fn memory(&self) -> anyhow::Result<&MemController> { - if let Some(Mem(memory)) = self - .cgroup + self.cgroup .subsystems() .iter() - .find(|sub| matches!(sub, Mem(_))) - { - Ok(memory) - } else { - anyhow::bail!("could not find memory subsystem") - } - } - - /// Get cgroup current memory usage. - pub fn current_memory_usage(&self) -> anyhow::Result { - Ok(self - .memory() - .context("failed to get memory subsystem")? - .memory_stat() - .usage_in_bytes) - } - - /// Set cgroup memory.high threshold. 
- pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> { - self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)) - } - - /// Set the cgroup's memory.high to 'max', disabling it. - pub fn unset_memory_high(&self) -> anyhow::Result<()> { - self.set_memory_high_internal(MaxValue::Max) - } - - fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> { - self.memory() - .context("failed to get memory subsystem")? - .set_mem(cgroups_rs::memory::SetMemory { - low: None, - high: Some(value), - min: None, - max: None, + .find_map(|sub| match sub { + Subsystem::Mem(c) => Some(c), + _ => None, }) - .map_err(anyhow::Error::from) + .ok_or_else(|| anyhow!("could not find memory subsystem")) } - /// Get memory.high threshold. - pub fn get_memory_high_bytes(&self) -> anyhow::Result { - let high = self - .memory() - .context("failed to get memory subsystem while getting memory statistics")? - .get_mem() - .map(|mem| mem.high) - .context("failed to get memory statistics from subsystem")?; - match high { - Some(MaxValue::Max) => Ok(i64::MAX as u64), - Some(MaxValue::Value(high)) => Ok(high as u64), - None => anyhow::bail!("failed to read memory.high from memory subsystem"), + /// Given a handle on the memory subsystem, returns the current memory information + fn memory_usage(mem_controller: &MemController) -> MemoryStatus { + let stat = mem_controller.memory_stat().stat; + MemoryStatus { + non_reclaimable: stat.active_anon + stat.inactive_anon, } } } + +// Helper function for `CgroupWatcher::watch` +fn ring_buf_recent_values_iter( + buf: &[T], + last_value_idx: usize, + count: usize, +) -> impl '_ + Iterator { + // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function + // easier (we only have to add `buf.len()` once, rather than a dynamic number of times). + assert!(count <= buf.len()); + + buf.iter() + // 'cycle' because the values could wrap around + .cycle() + // with 'cycle', this skip is more like 'offset', and functionally this is + // offsettting by 'last_value_idx - count (mod buf.len())', but we have to be + // careful to avoid underflow, so we pre-add buf.len(). + // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive. 
+ .skip((buf.len() + last_value_idx + 1 - count) % buf.len()) + .take(count) +} + +/// Summary of recent memory usage +#[derive(Debug, Copy, Clone)] +pub struct MemoryHistory { + /// Rolling average of non-reclaimable memory usage samples over the last `history_period` + pub avg_non_reclaimable: u64, + + /// The number of samples used to construct this summary + pub samples_count: usize, + /// Total timespan between the first and last sample used for this summary + pub samples_span: Duration, +} + +#[derive(Debug, Copy, Clone)] +pub struct MemoryStatus { + non_reclaimable: u64, +} + +impl MemoryStatus { + fn zeroed() -> Self { + MemoryStatus { non_reclaimable: 0 } + } + + fn debug_slice(slice: &[Self]) -> impl '_ + Debug { + struct DS<'a>(&'a [MemoryStatus]); + + impl<'a> Debug for DS<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("[MemoryStatus]") + .field( + "non_reclaimable[..]", + &Fields(self.0, |stat: &MemoryStatus| { + BytesToGB(stat.non_reclaimable) + }), + ) + .finish() + } + } + + struct Fields<'a, F>(&'a [MemoryStatus], F); + + impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_list().entries(self.0.iter().map(&self.1)).finish() + } + } + + struct BytesToGB(u64); + + impl Debug for BytesToGB { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.write_fmt(format_args!( + "{:.3}Gi", + self.0 as f64 / (1_u64 << 30) as f64 + )) + } + } + + DS(slice) + } +} + +#[cfg(test)] +mod tests { + #[test] + fn ring_buf_iter() { + let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + + let values = |offset, count| { + super::ring_buf_recent_values_iter(&buf, offset, count) + .copied() + .collect::>() + }; + + // Boundary conditions: start, end, and entire thing: + assert_eq!(values(0, 1), [0]); + assert_eq!(values(3, 4), [0, 1, 2, 3]); + assert_eq!(values(9, 4), [6, 7, 8, 9]); + assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + + // "normal" operation: no wraparound + assert_eq!(values(7, 4), [4, 5, 6, 7]); + + // wraparound: + assert_eq!(values(0, 4), [7, 8, 9, 0]); + assert_eq!(values(1, 4), [8, 9, 0, 1]); + assert_eq!(values(2, 4), [9, 0, 1, 2]); + assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]); + } +} diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 109a68fff1..c76baf04e7 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -12,12 +12,10 @@ use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, }; -use tokio::sync::mpsc; use tracing::info; -use crate::cgroup::Sequenced; use crate::protocol::{ - OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION, + OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, }; @@ -36,13 +34,6 @@ pub struct Dispatcher { /// We send messages to the agent through `sink` sink: SplitSink, - /// Used to notify the cgroup when we are upscaled. - pub(crate) notify_upscale_events: mpsc::Sender>, - - /// When the cgroup requests upscale it will send on this channel. In response - /// we send an `UpscaleRequst` to the agent. - pub(crate) request_upscale_events: mpsc::Receiver<()>, - /// The protocol version we have agreed to use with the agent. This is negotiated /// during the creation of the dispatcher, and should be the highest shared protocol /// version. @@ -61,11 +52,7 @@ impl Dispatcher { /// 1. 
Wait for the agent to sent the range of protocols it supports. /// 2. Send a protocol version that works for us as well, or an error if there /// is no compatible version. - pub async fn new( - stream: WebSocket, - notify_upscale_events: mpsc::Sender>, - request_upscale_events: mpsc::Receiver<()>, - ) -> anyhow::Result { + pub async fn new(stream: WebSocket) -> anyhow::Result { let (mut sink, mut source) = stream.split(); // Figure out the highest protocol version we both support @@ -119,22 +106,10 @@ impl Dispatcher { Ok(Self { sink, source, - notify_upscale_events, - request_upscale_events, proto_version: highest_shared_version, }) } - /// Notify the cgroup manager that we have received upscale and wait for - /// the acknowledgement. - #[tracing::instrument(skip_all, fields(?resources))] - pub async fn notify_upscale(&self, resources: Sequenced) -> anyhow::Result<()> { - self.notify_upscale_events - .send(resources) - .await - .context("failed to send resources and oneshot sender across channel") - } - /// Send a message to the agent. /// /// Although this function is small, it has one major benefit: it is the only diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index b0ee5f0310..a7a0995797 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -5,18 +5,16 @@ //! all functionality. use std::fmt::Debug; -use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{bail, Context}; use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; -use tokio::sync::broadcast; -use tokio::sync::mpsc; +use tokio::sync::{broadcast, watch}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use crate::cgroup::{CgroupWatcher, Sequenced}; +use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; @@ -28,7 +26,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args pub struct Runner { config: Config, filecache: Option, - cgroup: Option>, + cgroup: Option, dispatcher: Dispatcher, /// We "mint" new message ids by incrementing this counter and taking the value. @@ -45,6 +43,14 @@ pub struct Runner { kill: broadcast::Receiver<()>, } +#[derive(Debug)] +struct CgroupState { + watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>, + /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale + /// requests. + threshold: u64, +} + /// Configuration for a `Runner` #[derive(Debug)] pub struct Config { @@ -62,16 +68,56 @@ pub struct Config { /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field /// should be removed once we have a better solution there. sys_buffer_bytes: u64, + + /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in + /// other words, providing a ceiling for the highest value of the threshold by enforcing that + /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the + /// threshold. + /// + /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding + /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total + /// memory. 
+ /// + /// The default value of `0.15` means that we *guarantee* sending upscale requests if the + /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving + /// memory for the file cache). + cgroup_min_overhead_fraction: f64, + + cgroup_downscale_threshold_buffer_bytes: u64, } impl Default for Config { fn default() -> Self { Self { sys_buffer_bytes: 100 * MiB, + cgroup_min_overhead_fraction: 0.15, + cgroup_downscale_threshold_buffer_bytes: 100 * MiB, } } } +impl Config { + fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 { + // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup, + // and thus be non-reclaimable, so we should allow for additional memory usage. + // + // If the file cache sits on disk, our desired stable system state is for it to be fully + // page cached (its contents should only be paged to/from disk in situations where we can't + // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the + // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging + // out the file cache. + let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size); + + // Even if we're not separately making room for the file cache (if it's in tmpfs), we still + // want our threshold to be met gracefully instead of letting postgres get OOM-killed. + // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory + // remaining above the threshold. + let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64; + + memory_remaining_for_cgroup.min(max_threshold) + } +} + impl Runner { /// Create a new monitor. #[tracing::instrument(skip_all, fields(?config, ?args))] @@ -87,12 +133,7 @@ impl Runner { "invalid monitor Config: sys_buffer_bytes cannot be 0" ); - // *NOTE*: the dispatcher and cgroup manager talk through these channels - // so make sure they each get the correct half, nothing is droppped, etc. - let (notified_send, notified_recv) = mpsc::channel(1); - let (requesting_send, requesting_recv) = mpsc::channel(1); - - let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv) + let dispatcher = Dispatcher::new(ws) .await .context("error creating new dispatcher")?; @@ -106,46 +147,10 @@ impl Runner { kill, }; - // If we have both the cgroup and file cache integrations enabled, it's possible for - // temporary failures to result in cgroup throttling (from memory.high), that in turn makes - // it near-impossible to connect to the file cache (because it times out). Unfortunately, - // we *do* still want to determine the file cache size before setting the cgroup's - // memory.high, so it's not as simple as just swapping the order. - // - // Instead, the resolution here is that on vm-monitor startup (note: happens on each - // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we - // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit - // of a hacky solution, but helps with reliability. - if let Some(name) = &args.cgroup { - // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state - // now, and then set limits later. 
- info!("initializing cgroup"); - - let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send) - .context("failed to create cgroup manager")?; - - info!("temporarily unsetting memory.high"); - - // Temporarily un-set cgroup memory.high; see above. - cgroup - .unset_memory_high() - .context("failed to unset memory.high")?; - - let cgroup = Arc::new(cgroup); - - let cgroup_clone = Arc::clone(&cgroup); - spawn_with_cancel( - token.clone(), - |_| error!("cgroup watcher terminated"), - async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await }, - ); - - state.cgroup = Some(cgroup); - } - - let mut file_cache_reserved_bytes = 0; let mem = get_total_system_memory(); + let mut file_cache_disk_size = 0; + // We need to process file cache initialization before cgroup initialization, so that the memory // allocated to the file cache is appropriately taken into account when we decide the cgroup's // memory limits. @@ -156,7 +161,7 @@ impl Runner { false => FileCacheConfig::default_in_memory(), }; - let mut file_cache = FileCacheState::new(connstr, config, token) + let mut file_cache = FileCacheState::new(connstr, config, token.clone()) .await .context("failed to create file cache")?; @@ -181,23 +186,40 @@ impl Runner { if actual_size != new_size { info!("file cache size actually got set to {actual_size}") } - // Mark the resources given to the file cache as reserved, but only if it's in memory. - if !args.file_cache_on_disk { - file_cache_reserved_bytes = actual_size; + + if args.file_cache_on_disk { + file_cache_disk_size = actual_size; } state.filecache = Some(file_cache); } - if let Some(cgroup) = &state.cgroup { - let available = mem - file_cache_reserved_bytes; - let value = cgroup.config.calculate_memory_high_value(available); + if let Some(name) = &args.cgroup { + // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state + // now, and then set limits later. 
+ info!("initializing cgroup"); - info!(value, "setting memory.high"); + let cgroup = + CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?; - cgroup - .set_memory_high_bytes(value) - .context("failed to set cgroup memory.high")?; + let init_value = cgroup::MemoryHistory { + avg_non_reclaimable: 0, + samples_count: 0, + samples_span: Duration::ZERO, + }; + let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value)); + + spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move { + cgroup.watch(hist_tx).await + }); + + let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size); + info!(threshold, "set initial cgroup threshold",); + + state.cgroup = Some(CgroupState { + watcher: hist_rx, + threshold, + }); } Ok(state) @@ -217,28 +239,40 @@ impl Runner { let requested_mem = target.mem; let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes); - let expected_file_cache_mem_usage = self + let (expected_file_cache_size, expected_file_cache_disk_size) = self .filecache .as_ref() - .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory)) - .unwrap_or(0); - let mut new_cgroup_mem_high = 0; + .map(|file_cache| { + let size = file_cache.config.calculate_cache_size(usable_system_memory); + match file_cache.config.in_memory { + true => (size, 0), + false => (size, size), + } + }) + .unwrap_or((0, 0)); if let Some(cgroup) = &self.cgroup { - new_cgroup_mem_high = cgroup + let (last_time, last_history) = *cgroup.watcher.borrow(); + + // TODO: make the duration here configurable. + if last_time.elapsed() > Duration::from_secs(5) { + bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); + } else if last_history.samples_count <= 1 { + bail!("haven't received enough cgroup memory stats yet"); + } + + let new_threshold = self .config - .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage); + .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size); - let current = cgroup - .current_memory_usage() - .context("failed to fetch cgroup memory")?; + let current = last_history.avg_non_reclaimable; - if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes { + if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes { let status = format!( - "{}: {} MiB (new high) < {} (current usage) + {} (buffer)", - "calculated memory.high too low", - bytes_to_mebibytes(new_cgroup_mem_high), + "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)", + "calculated memory threshold too low", + bytes_to_mebibytes(new_threshold), bytes_to_mebibytes(current), - bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes) + bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes) ); info!(status, "discontinuing downscale"); @@ -249,14 +283,14 @@ impl Runner { // The downscaling has been approved. Downscale the file cache, then the cgroup. 
let mut status = vec![]; - let mut file_cache_mem_usage = 0; + let mut file_cache_disk_size = 0; if let Some(file_cache) = &mut self.filecache { let actual_usage = file_cache - .set_file_cache_size(expected_file_cache_mem_usage) + .set_file_cache_size(expected_file_cache_size) .await .context("failed to set file cache size")?; - if file_cache.config.in_memory { - file_cache_mem_usage = actual_usage; + if !file_cache.config.in_memory { + file_cache_disk_size = actual_usage; } let message = format!( "set file cache size to {} MiB (in memory = {})", @@ -267,24 +301,18 @@ impl Runner { status.push(message); } - if let Some(cgroup) = &self.cgroup { - let available_memory = usable_system_memory - file_cache_mem_usage; - - if file_cache_mem_usage != expected_file_cache_mem_usage { - new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory); - } - - // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here - // since it is properly initialized in the previous cgroup if let block - cgroup - .set_memory_high_bytes(new_cgroup_mem_high) - .context("failed to set cgroup memory.high")?; + if let Some(cgroup) = &mut self.cgroup { + let new_threshold = self + .config + .cgroup_threshold(usable_system_memory, file_cache_disk_size); let message = format!( - "set cgroup memory.high to {} MiB, of new max {} MiB", - bytes_to_mebibytes(new_cgroup_mem_high), - bytes_to_mebibytes(available_memory) + "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB", + bytes_to_mebibytes(cgroup.threshold), + bytes_to_mebibytes(new_threshold), + bytes_to_mebibytes(usable_system_memory) ); + cgroup.threshold = new_threshold; info!("downscale: {message}"); status.push(message); } @@ -305,8 +333,7 @@ impl Runner { let new_mem = resources.mem; let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes); - // Get the file cache's expected contribution to the memory usage - let mut file_cache_mem_usage = 0; + let mut file_cache_disk_size = 0; if let Some(file_cache) = &mut self.filecache { let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory); info!( @@ -319,8 +346,8 @@ impl Runner { .set_file_cache_size(expected_usage) .await .context("failed to set file cache size")?; - if file_cache.config.in_memory { - file_cache_mem_usage = actual_usage; + if !file_cache.config.in_memory { + file_cache_disk_size = actual_usage; } if actual_usage != expected_usage { @@ -332,18 +359,18 @@ impl Runner { } } - if let Some(cgroup) = &self.cgroup { - let available_memory = usable_system_memory - file_cache_mem_usage; - let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory); + if let Some(cgroup) = &mut self.cgroup { + let new_threshold = self + .config + .cgroup_threshold(usable_system_memory, file_cache_disk_size); + info!( - target = bytes_to_mebibytes(new_cgroup_mem_high), - total = bytes_to_mebibytes(new_mem), - name = cgroup.path(), - "updating cgroup memory.high", + "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB", + bytes_to_mebibytes(cgroup.threshold), + bytes_to_mebibytes(new_threshold), + bytes_to_mebibytes(usable_system_memory) ); - cgroup - .set_memory_high_bytes(new_cgroup_mem_high) - .context("failed to set cgroup memory.high")?; + cgroup.threshold = new_threshold; } Ok(()) @@ -361,10 +388,6 @@ impl Runner { self.handle_upscale(granted) .await .context("failed to handle upscale")?; - self.dispatcher - .notify_upscale(Sequenced::new(granted)) - .await - .context("failed to 
notify notify cgroup of upscale")?; Ok(Some(OutboundMsg::new( OutboundMsgKind::UpscaleConfirmation {}, id, @@ -408,33 +431,53 @@ impl Runner { Err(e) => bail!("failed to receive kill signal: {e}") } } - // we need to propagate an upscale request - request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => { - if request.is_none() { - bail!("failed to listen for upscale event from cgroup") + + // New memory stats from the cgroup, *may* need to request upscaling, if we've + // exceeded the threshold + result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => { + result.context("failed to receive from cgroup memory stats watcher")?; + + let cgroup = self.cgroup.as_ref().unwrap(); + + let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow(); + + // If we haven't exceeded the threshold, then we're all ok + if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold { + continue; } - // If it's been less than 1 second since the last time we requested upscaling, - // ignore the event, to avoid spamming the agent (otherwise, this can happen - // ~1k times per second). + // Otherwise, we generally want upscaling. But, if it's been less than 1 second + // since the last time we requested upscaling, ignore the event, to avoid + // spamming the agent. if let Some(t) = self.last_upscale_request_at { let elapsed = t.elapsed(); if elapsed < Duration::from_secs(1) { - info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring"); + info!( + elapsed_millis = elapsed.as_millis(), + avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable), + threshold = bytes_to_mebibytes(cgroup.threshold), + "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring", + ); continue; } } self.last_upscale_request_at = Some(Instant::now()); - info!("cgroup asking for upscale; forwarding request"); + info!( + avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable), + threshold = bytes_to_mebibytes(cgroup.threshold), + "cgroup memory stats are high enough to upscale, requesting upscale", + ); + self.counter += 2; // Increment, preserving parity (i.e. keep the // counter odd). See the field comment for more. self.dispatcher .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter)) .await .context("failed to send message")?; - } + }, + // there is a message from the agent msg = self.dispatcher.source.next() => { if let Some(msg) = msg { From 8a316b127711cdf5777f20547b15583be0b6499a Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 19 Oct 2023 09:10:33 -0700 Subject: [PATCH 082/412] vm-monitor: Log full error on message handling failure (#5604) There's currently an issue with the vm-monitor on staging that's not really feasible to debug because the current display impl gives no context to the errors (just says "failed to downscale"). Logging the full error should help. For communications with the autoscaler-agent, it's ok to only provide the outermost cause, because we can cross-reference with the VM logs. At some point in the future, we may want to change that. 
--- libs/vm_monitor/src/runner.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index a7a0995797..e818292196 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -505,11 +505,14 @@ impl Runner { Ok(Some(out)) => out, Ok(None) => continue, Err(e) => { - let error = e.to_string(); - warn!(?error, "error handling message"); + // use {:#} for our logging because the display impl only + // gives the outermost cause, and the debug impl + // pretty-prints the error, whereas {:#} contains all the + // causes, but is compact (no newlines). + warn!(error = format!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { - error + error: e.to_string(), }, message.id ) From 850db4cc139d625a2097afa7d464d5fc66c0dfd2 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 19 Oct 2023 11:09:37 -0700 Subject: [PATCH 083/412] vm-monitor: Deny not fail downscale if no memory stats yet (#5606) Fixes an issue we observed on staging that happens when the autoscaler-agent attempts to immediately downscale the VM after binding, which is typical for pooled computes. The issue was occurring because the autoscaler-agent was requesting downscaling before the vm-monitor had gathered sufficient cgroup memory stats to be confident in approving it. When the vm-monitor returned an internal error instead of denying downscaling, the autoscaler-agent retried the connection and immediately hit the same issue (in part because cgroup stats are collected per-connection, rather than globally). --- libs/vm_monitor/src/runner.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index e818292196..99bea6d7ca 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -253,11 +253,22 @@ impl Runner { if let Some(cgroup) = &self.cgroup { let (last_time, last_history) = *cgroup.watcher.borrow(); + // NB: The ordering of these conditions is intentional. During startup, we should deny + // downscaling until we have enough information to determine that it's safe to do so + // (i.e. enough samples have come in). But if it's been a while and we *still* haven't + // received any information, we should *fail* instead of just denying downscaling. + // + // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()` + // serves double-duty: it trips if we haven't received *any* metrics for long enough, + // OR if we haven't received metrics *recently enough*. + // // TODO: make the duration here configurable. if last_time.elapsed() > Duration::from_secs(5) { bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); } else if last_history.samples_count <= 1 { - bail!("haven't received enough cgroup memory stats yet"); + let status = "haven't received enough cgroup memory stats yet"; + info!(status, "discontinuing downscale"); + return Ok((false, status.to_owned())); } let new_threshold = self From 0d80d6ce1873b067d608acc8942ba9ef909f8c08 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 25 Oct 2023 20:35:23 +0300 Subject: [PATCH 084/412] Ignore missed AUX_FILES_KEY when generating image layer (#5660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Logical replication requires new AUX_FILES_KEY which is definitely absent in existed database. 
We do not have function to check if key exists in our KV storage. So I have to handle the error in `list_aux_files` method. But this key is also included in key space range and accessed y `create_image_layer` method. ## Summary of changes Check if AUX_FILES_KEY exists before including it in keyspace. --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Arpad Müller --- pageserver/src/pgdatadir_mapping.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d27c8a3d5d..52c44746dc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -675,8 +675,9 @@ impl Timeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - result.add_key(AUX_FILES_KEY); - + if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { + result.add_key(AUX_FILES_KEY); + } Ok(result.to_keyspace()) } From 4b3b37b91250fe5732ec98cf9a64bd59db678d8b Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 24 Oct 2023 16:04:28 -0700 Subject: [PATCH 085/412] Bump vm-builder v0.18.1 -> v0.18.2 (#5646) Only applicable change was neondatabase/autoscaling#571, removing the postgres_exporter flags `--auto-discover-databases` and `--exclude-databases=...` --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3f1c728c70..8fda08efda 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -837,7 +837,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.18.1 + VM_BUILDER_VERSION: v0.18.2 steps: - name: Checkout From 7d384d6953197c8b62e5eade00979a824f044d6e Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 26 Oct 2023 12:04:57 -0700 Subject: [PATCH 086/412] Bump vm-builder v0.18.2 -> v0.18.4 (#5666) Only applicable change was neondatabase/autoscaling#584, setting pgbouncer auth_dbname=postgres in order to fix superuser connections from preventing dropping databases. 
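For context on the pgbouncer setting mentioned above: the change itself lives in the vm-builder templates of neondatabase/autoscaling rather than in this diff, so the snippet below is only an illustrative sketch of what such a setting looks like, not the actual generated configuration:

```ini
; Illustrative sketch only -- the real pgbouncer.ini is generated by vm-builder.
; Pointing authentication lookups at a fixed maintenance database avoids
; pgbouncer keeping an extra connection open to the client's target database,
; which could otherwise block DROP DATABASE for that database.
[pgbouncer]
auth_dbname = postgres
```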
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8fda08efda..6b487c0789 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -837,7 +837,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.18.2 + VM_BUILDER_VERSION: v0.18.5 steps: - name: Checkout From 48b845fa769073e232ab39b09d386ba251fcb8a3 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 27 Nov 2023 21:45:41 +0000 Subject: [PATCH 087/412] Make neon extension relocatable to allow SET SCHEMA (#5942) --- pgxn/neon/neon.control | 1 + 1 file changed, 1 insertion(+) diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index c110437c3e..4e4cb9f372 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -2,3 +2,4 @@ comment = 'cloud storage for PostgreSQL' default_version = '1.1' module_pathname = '$libdir/neon' +relocatable = true From e81fc598f416a2620ae8e942811e545495de490d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 27 Nov 2023 23:29:24 +0000 Subject: [PATCH 088/412] Update neon extension relocatable for existing installations (#5943) --- compute_tools/src/spec.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97aa144c79..8c44c6d519 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -687,6 +687,9 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { info!("create neon extension with query: {}", query); client.simple_query(query)?; + query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'"; + client.simple_query(query)?; + query = "ALTER EXTENSION neon SET SCHEMA neon"; info!("alter neon extension schema with query: {}", query); client.simple_query(query)?; From 5d71601ca9e92c9411ca751a6373e03dd64f03dc Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 29 Nov 2023 11:36:23 +0300 Subject: [PATCH 089/412] Notify safekeeper readiness with systemd. To avoid downtime during deploy, as in busy regions initial load can currently take ~30s. 
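The readiness notification only postpones the unit's transition to the active state if the consuming service is declared as a notify-type unit. The safekeeper unit file is not part of this patch, so the following is just a rough sketch of what it would need, with the binary path and timeout being illustrative assumptions:

```ini
# Illustrative sketch only; the real unit file is maintained outside this repo.
[Service]
# Required so that systemd waits for sd_notify(READY=1) instead of treating
# the service as started as soon as the process has been spawned.
Type=notify
ExecStart=/usr/local/bin/safekeeper
# Allow for the slow initial timeline load (~30s in busy regions) plus margin.
TimeoutStartSec=300
```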
--- Cargo.lock | 7 +++++++ Cargo.toml | 1 + safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +++++++ 4 files changed, 16 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index a75209b8db..48c7bf1795 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4137,6 +4137,7 @@ dependencies = [ "reqwest", "safekeeper_api", "scopeguard", + "sd-notify", "serde", "serde_json", "serde_with", @@ -4199,6 +4200,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "sd-notify" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32" + [[package]] name = "security-framework" version = "2.9.1" diff --git a/Cargo.toml b/Cargo.toml index c50fb7be42..6df48ffc55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -122,6 +122,7 @@ rustls-pemfile = "1" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" +sd-notify = "0.4.1" sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 30516c0763..53fcd5ff07 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -46,6 +46,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true utils.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 0b5bb22c8b..e59deb9fda 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -8,6 +8,7 @@ use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use remote_storage::RemoteStorageConfig; +use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; @@ -434,6 +435,12 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let mut sigint_stream = signal(SignalKind::interrupt())?; let mut sigterm_stream = signal(SignalKind::terminate())?; + // Notify systemd that we are ready. This is important as currently loading + // timelines takes significant time (~30s in busy regions). + if let Err(e) = sd_notify::notify(true, &[NotifyState::Ready]) { + warn!("systemd notify failed: {:?}", e); + } + tokio::select! 
{ Some((task_name, res)) = tasks_handles.next()=> { error!("{} task failed: {:?}, exiting", task_name, res); From 40087b8164183d18da670657cc9aac042b23f769 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Dec 2023 12:43:09 +0000 Subject: [PATCH 090/412] fix: use create_new instead of create for mutex file --- pageserver/src/tenant.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 249a9a80c5..7384459ab5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3147,7 +3147,10 @@ impl Tenant { let uninit_mark_path = self .conf .timeline_uninit_mark_file_path(tenant_id, timeline_id); - fs::File::create(&uninit_mark_path) + fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&uninit_mark_path) .context("Failed to create uninit mark file") .and_then(|_| { crashsafe::fsync_file_and_parent(&uninit_mark_path) From 558394f7103f33da8cfb36c7486ae5872bd4aa34 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 4 Dec 2023 11:41:27 +0200 Subject: [PATCH 091/412] fix merge --- Cargo.lock | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2014c0498f..5639665758 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4363,12 +4363,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "sd-notify" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32" - [[package]] name = "security-framework" version = "2.9.1" From 45c51227542b99cb5c4e6068e0ea34e61cfe3f19 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 4 Dec 2023 12:36:19 -0800 Subject: [PATCH 092/412] Remove trusted from wal2json --- Dockerfile.compute-node | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 425f163e8b..a3772265c0 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -721,8 +721,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control + make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # From 661fc41e716c50703876eba0f212e96ef8ae273f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 6 Dec 2023 16:12:36 +0100 Subject: [PATCH 093/412] Revert timescaledb for pg14 and pg15 (#6056) ``` could not start the compute node: compute is in state "failed": db error: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory Caused by: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory ``` --- Dockerfile.compute-node | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a3772265c0..03280586f8 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -387,10 +387,20 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN apt-get update && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export TIMESCALEDB_VERSION=2.10.1 \ + export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ + ;; \ + *) \ + export TIMESCALEDB_VERSION=2.13.0 \ + export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ + ;; \ + esac && \ + apt-get update && \ apt-get install -y cmake && \ - wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \ - echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ + echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ From 04b82c92a72de06299ca1d73e452138f53844918 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Dec 2023 23:27:53 +0200 Subject: [PATCH 094/412] fix: accidential return Ok (#6106) Error indicating request cancellation OR timeline shutdown was deemed as a reason to exit the background worker that calculated synthetic size. Fix it to only be considered for avoiding logging such of such errors. This conflicted on tenant_shard_id having already replaced tenant_id on `main`. --- pageserver/src/consumption_metrics.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7ad6a0f890..bfd7897b49 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -281,12 +281,18 @@ async fn calculate_synthetic_size_worker( // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. 
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - return Ok(()); + // this error can be returned if timeline is shutting down, but it does not + // mean the synthetic size worker should terminate. we do not need any checks + // in this function because `mgr::get_tenant` will error out after shutdown has + // progressed to shutting down tenants. + let is_cancelled = matches!( + e.downcast_ref::(), + Some(PageReconstructError::Cancelled) + ); + + if !is_cancelled { + error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } - error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } } } @@ -299,7 +305,7 @@ async fn calculate_synthetic_size_worker( let res = tokio::time::timeout_at( started_at + synthetic_size_calculation_interval, - task_mgr::shutdown_token().cancelled(), + cancel.cancelled(), ) .await; if res.is_ok() { From 8ff3253f20dcbcee24b31b65d71c23a6b30d5c43 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Dec 2023 10:25:43 -0600 Subject: [PATCH 095/412] Fix git ownership issue in check-codestyle-rust-arm We have this workaround for other jobs. Looks like this one was forgotten about. --- .github/workflows/neon_extra_builds.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 0d7db8dfbc..09a106fb52 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -238,6 +238,20 @@ jobs: options: --init steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + - name: Checkout uses: actions/checkout@v4 with: From a92702b01e1021b579c04b10bbf8ac7002939bfa Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Dec 2023 10:46:41 -0600 Subject: [PATCH 096/412] Add submodule paths as safe directories as a precaution The check-codestyle-rust-arm job requires this for some reason, so let's just add them everywhere we do this workaround. 
--- .github/workflows/build_and_test.yml | 8 ++++++++ .github/workflows/neon_extra_builds.yml | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 820848b4fb..693ed1a66f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -199,6 +199,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 @@ -1097,6 +1101,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 09a106fb52..b1ea5e4f74 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -142,6 +142,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v4 From 322ea1cf7ca6426643c0c3a9a82d9b9fba538e1b Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 18 Dec 2023 10:29:19 +0000 Subject: [PATCH 097/412] pageserver: on-demand activation cleanups (#6157) ## Problem #6112 added some logs and metrics: clean these up a bit: - Avoid counting startup completions for tenants launched after startup - exclude no-op cases from timing histograms - remove a rogue log messages --- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1478a1a445..eceef6bf78 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -629,9 +629,12 @@ impl Tenant { "attach tenant", false, async move { + // Is this tenant being spawned as part of process startup? + let starting_up = init_order.is_some(); scopeguard::defer! { - tracing::info!("Increment complete count"); - TENANT.startup_complete.inc(); + if starting_up { + TENANT.startup_complete.inc(); + } } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. 
@@ -711,7 +714,11 @@ impl Tenant { let preload_timer = TENANT.preload.start_timer(); let preload = match mode { - SpawnMode::Create => {None}, + SpawnMode::Create => { + // Don't count the skipped preload into the histogram of preload durations + preload_timer.stop_and_discard(); + None + }, SpawnMode::Normal => { match &remote_storage { Some(remote_storage) => Some( @@ -721,7 +728,11 @@ impl Tenant { tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { - Ok(p) => p, + Ok(p) => { + preload_timer.observe_duration(); + p + } + , Err(e) => { make_broken(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); @@ -732,7 +743,6 @@ impl Tenant { } } }; - preload_timer.observe_duration(); // Remote preload is complete. drop(remote_load_completion); @@ -784,15 +794,19 @@ impl Tenant { } } - let attach_timer = TENANT.attach.start_timer(); + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + }; match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); - attach_timer.observe_duration(); + if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - attach_timer.observe_duration(); + if let Some(t)= attach_timer {t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } From e67b8f69c0e30389cbd037521d5f6024180c6752 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Dec 2023 13:39:48 +0100 Subject: [PATCH 098/412] [PRE-MERGE] pageserver: Reduce tracing overhead in timeline::get #6115 Pre-merge `git merge --squash` of https://github.com/neondatabase/neon/pull/6115 Lowering the tracing level in get_value_reconstruct_data and get_or_maybe_download from info to debug reduces the overhead of span creation in non-debug environments. --- pageserver/src/tenant/storage_layer/layer.rs | 160 ++++++++++--------- test_runner/regress/test_broken_timeline.py | 4 +- 2 files changed, 85 insertions(+), 79 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a4b102c314..9a8ddc1a6b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -259,8 +259,9 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) + .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) .await + .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } /// Download the layer if evicted. @@ -654,7 +655,6 @@ impl LayerInner { } /// Cancellation safe. 
- #[tracing::instrument(skip_all, fields(layer=%self))] async fn get_or_maybe_download( self: &Arc, allow_download: bool, @@ -663,95 +663,101 @@ impl LayerInner { let mut init_permit = None; loop { - let download = move |permit| async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let download = move |permit| { + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // FIXME: grab a gate - let can_ever_evict = timeline.remote_client.as_ref().is_some(); + let can_ever_evict = timeline.remote_client.as_ref().is_some(); - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed)?; - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); + let permit = if let Some(reason) = needs_download { + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if !can_ever_evict { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + self.spawn_download_and_wait(timeline, permit).await? + } else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. 
+ LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + permit + }; + + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!( + waiters, + "completing the on-demand download for other tasks" + ); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + scopeguard::ScopeGuard::into_inner(init_cancelled); - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + Ok((ResidentOrWantedEvicted::Resident(res), permit)) } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) }; if let Some(init_permit) = init_permit.take() { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 53eeb8bbe9..4da0ba7b20 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*layer loading failed:.*", + ".*get_value_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. 
Current state: Broken.*", @@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="layer loading failed:") as err: + with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" From 80be423a58a5fa0518c70a33727678731ecf9561 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 18 Dec 2023 10:22:36 -0800 Subject: [PATCH 099/412] Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles --- compute_tools/src/spec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index ba1ee6d1b2..20299c8fde 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited // from neon_superuser. let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); From 6eda0a3158ce40bd4520ba6e275bd3958f1a7be7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 13:31:04 +0000 Subject: [PATCH 100/412] [PRE-MERGE] fix metric `pageserver_initial_logical_size_start_calculation` (This is a pre-merge cherry-pick of https://github.com/neondatabase/neon/pull/6191) It wasn't being incremented. 
Fixup of commit 1c88824ed0e6bfbce02fa92e13ca91d5ab0e37b3 Author: Christian Schwarz Date: Fri Dec 1 12:52:59 2023 +0100 initial logical size calculation: add a bunch of metrics (#5995) --- pageserver/src/metrics.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 45c01b71d1..e34d7c0287 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -522,14 +522,18 @@ pub(crate) mod initial_logical_size { impl StartCalculation { pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); - self.0.with_label_values(&["first", circumstances_label]); + self.0 + .with_label_values(&["first", circumstances_label]) + .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.first.clone()), } } pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); - self.0.with_label_values(&["retry", circumstances_label]); + self.0 + .with_label_values(&["retry", circumstances_label]) + .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()), } From 39be366fc5a4d3e3c9713c247dc42305f4fc56e4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 14:46:17 +0100 Subject: [PATCH 101/412] higher resolution histograms for getpage@lsn (#6177) part of https://github.com/neondatabase/cloud/issues/7811 --- pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e34d7c0287..4725903783 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1023,12 +1023,62 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { + [ + 1, + 10, + 20, + 40, + 60, + 80, + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1_000, // 1ms + 2_000, + 4_000, + 6_000, + 8_000, + 10_000, // 10ms + 20_000, + 40_000, + 60_000, + 80_000, + 100_000, + 200_000, + 400_000, + 600_000, + 800_000, + 1_000_000, // 1s + 2_000_000, + 4_000_000, + 6_000_000, + 8_000_000, + 10_000_000, // 10s + 20_000_000, + 50_000_000, + 100_000_000, + 200_000_000, + 1_000_000_000, // 1000s + ] + .into_iter() + .map(Duration::from_micros) + .map(|d| d.as_secs_f64()) + .collect() +}); + static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", "Time spent on smgr query handling, aggregated by query type.", &["smgr_query_type"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) .expect("failed to define a metric") }); From bd4dae8f4a5d4ea3fe2279dbd30a30a0943ffbab Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 14:38:08 +0300 Subject: [PATCH 102/412] compute_ctl: kill postgres and sync-safekeeprs on exit. Otherwise they are left orphaned when compute_ctl is terminated with a signal. It was invisible most of the time because normally neon_local or k8s kills postgres directly and then compute_ctl finishes gracefully. However, in some tests compute_ctl gets stuck waiting for sync-safekeepers which intentionally never ends because safekeepers are offline, and we want to stop compute_ctl without leaving orphanes behind. 
This is a quite rough approach which doesn't wait for children termination. A better way would be to convert compute_ctl to async which would make waiting easy. --- Cargo.lock | 2 ++ compute_tools/Cargo.toml | 2 ++ compute_tools/src/bin/compute_ctl.rs | 32 +++++++++++++++++++++++++++- compute_tools/src/compute.rs | 8 +++++++ control_plane/src/endpoint.rs | 18 ++++++++++++---- 5 files changed, 57 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index abd87dc0da..8e0ad7c8ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1161,6 +1161,7 @@ dependencies = [ "flate2", "futures", "hyper", + "nix 0.26.2", "notify", "num_cpus", "opentelemetry", @@ -1171,6 +1172,7 @@ dependencies = [ "rust-ini", "serde", "serde_json", + "signal-hook", "tar", "tokio", "tokio-postgres", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 142fa08495..759a117ee9 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -13,6 +13,7 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } +nix.workspace = true notify.workspace = true num_cpus.workspace = true opentelemetry.workspace = true @@ -20,6 +21,7 @@ postgres.workspace = true regex.workspace = true serde.workspace = true serde_json.workspace = true +signal-hook.workspace = true tar.workspace = true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 436db59088..eb1d746f04 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,18 +40,22 @@ use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use nix::sys::signal::{kill, Signal}; +use signal_hook::consts::{SIGQUIT, SIGTERM}; +use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -67,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; + thread::spawn(move || { + for sig in signals.forever() { + handle_exit_signal(sig); + } + }); + let build_tag = option_env!("BUILD_TAG") .unwrap_or(BUILD_TAG_DEFAULT) .to_string(); @@ -346,6 +357,7 @@ fn main() -> Result<()> { let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() } @@ -519,6 +531,24 @@ fn cli() -> clap::Command { ) } +/// When compute_ctl is killed, send also termination signal to sync-safekeepers +/// to prevent leakage. TODO: it is better to convert compute_ctl to async and +/// wait for termination which would be easy then. 
+fn handle_exit_signal(sig: i32) { + info!("received {sig} termination signal"); + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + kill(pg_pid, Signal::SIGTERM).ok(); + } + exit(1); +} + #[test] fn verify_cli() { cli().debug_assert() diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index cd7be0520e..13701b7378 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -6,6 +6,8 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; +use std::sync::atomic::AtomicU32; +use std::sync::atomic::Ordering; use std::sync::{Condvar, Mutex, RwLock}; use std::thread; use std::time::Instant; @@ -34,6 +36,9 @@ use crate::spec::*; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server}; +pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); +pub static PG_PID: AtomicU32 = AtomicU32::new(0); + /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { // Url type maintains proper escaping @@ -501,6 +506,7 @@ impl ComputeNode { .stdout(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); + SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst); // `postgres --sync-safekeepers` will print all log output to stderr and // final LSN to stdout. So we pipe only stdout, while stderr will be automatically @@ -508,6 +514,7 @@ impl ComputeNode { let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); + SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); if !sync_output.status.success() { anyhow::bail!( @@ -662,6 +669,7 @@ impl ComputeNode { }) .spawn() .expect("cannot start postgres process"); + PG_PID.store(pg.id(), Ordering::SeqCst); wait_for_postgres(&mut pg, pgdata_path)?; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 55b66742ca..3d5dfd6311 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; use compute_api::spec::RemoteExtSpec; +use nix::sys::signal::kill; +use nix::sys::signal::Signal; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -439,11 +441,14 @@ impl Endpoint { Ok(()) } - fn wait_for_compute_ctl_to_exit(&self) -> Result<()> { + fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { // TODO use background_process::stop_process instead let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); + if send_sigterm { + kill(pid, Signal::SIGTERM).ok(); + } crate::background_process::wait_until_stopped("compute_ctl", pid)?; Ok(()) } @@ -733,10 +738,15 @@ impl Endpoint { &None, )?; - // Also wait for the compute_ctl process to die. It might have some cleanup - // work to do after postgres stops, like syncing safekeepers, etc. + // Also wait for the compute_ctl process to die. It might have some + // cleanup work to do after postgres stops, like syncing safekeepers, + // etc. // - self.wait_for_compute_ctl_to_exit()?; + // If destroying, send it SIGTERM before waiting. 
Sometimes we do *not* + // want this cleanup: tests intentionally do stop when majority of + // safekeepers is down, so sync-safekeepers would hang otherwise. This + // could be a separate flag though. + self.wait_for_compute_ctl_to_exit(destroy)?; if destroy { println!( "Destroying postgres data directory '{}'", From 392843ad2ac550d63a8847f74f83e955addfb701 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 14:43:44 +0300 Subject: [PATCH 103/412] Fix safekeeper START_REPLICATION (term=n). It was giving WAL only up to commit_lsn instead of flush_lsn, so recovery of uncommitted WAL since cdb08f03 hanged. Add test for this. --- safekeeper/src/send_wal.rs | 11 +---- .../regress/test_wal_acceptor_async.py | 40 +++++++++++++++++++ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 44f14f8c7e..70590a0f95 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -391,15 +391,8 @@ impl SafekeeperPostgresHandler { // application_name: give only committed WAL (used by pageserver) or all // existing WAL (up to flush_lsn, used by walproposer or peer recovery). // The second case is always driven by a consensus leader which term - // must generally be also supplied. However we're sloppy to do this in - // walproposer recovery which will be removed soon. So TODO is to make - // it not Option'al then. - // - // Fetching WAL without term in recovery creates a small risk of this - // WAL getting concurrently garbaged if another compute rises which - // collects majority and starts fixing log on this safekeeper itself. - // That's ok as (old) proposer will never be able to commit such WAL. - let end_watch = if self.is_walproposer_recovery() { + // must be supplied. + let end_watch = if term.is_some() { EndWatch::Flush(tli.get_term_flush_lsn_watch_rx()) } else { EndWatch::Commit(tli.get_commit_lsn_watch_rx()) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index feab7e605b..77d67cd63a 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -475,6 +475,46 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): asyncio.run(run_unavailability(env, endpoint)) +async def run_recovery_uncommitted(env: NeonEnv): + (sk1, sk2, _) = env.safekeepers + + env.neon_cli.create_branch("test_recovery_uncommitted") + ep = env.endpoints.create_start("test_recovery_uncommitted") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one safekeeper up to create tail of flushed but not committed WAL + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 2000), 'payload'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. + ep.stop_and_destroy() + + # Start one of sks to make quorum online plus compute and ensure they can + # sync. + sk2.start() + ep = env.endpoints.create_start( + "test_recovery_uncommitted", + ) + ep.safe_psql("insert into t select generate_series(1, 2000), 'payload'") + + +# Test pulling uncommitted WAL (up to flush_lsn) during recovery. 
+def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_recovery_uncommitted(env)) + + @dataclass class RaceConditionTest: iteration: int From d6cfcb0d93529bfae5a20a751e3f280fbe8424e9 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 22:33:27 +0300 Subject: [PATCH 104/412] Move failpoint support code to utils. To enable them in safekeeper as well. --- Cargo.lock | 1 + libs/pageserver_api/src/models.rs | 13 ----- libs/utils/Cargo.toml | 7 +++ .../utils}/src/failpoint_support.rs | 57 ++++++++++++++++++- libs/utils/src/lib.rs | 2 + pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/http/routes.rs | 32 +---------- pageserver/src/lib.rs | 2 - pageserver/src/tenant.rs | 9 ++- pageserver/src/walingest.rs | 5 +- 10 files changed, 74 insertions(+), 57 deletions(-) rename {pageserver => libs/utils}/src/failpoint_support.rs (61%) diff --git a/Cargo.lock b/Cargo.lock index 8e0ad7c8ee..73cb83d3a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5880,6 +5880,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "fail", "futures", "heapless", "hex", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index be41b610b8..dea925b468 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState { ShutDown, } -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. - pub actions: String, -} - #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index af0414daa2..706b7a3187 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] arc-swap.workspace = true sentry.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } +fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true nix.workspace = true diff --git a/pageserver/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs similarity index 61% rename from pageserver/src/failpoint_support.rs rename to libs/utils/src/failpoint_support.rs index 2190eba18a..5ec532e2a6 100644 --- a/pageserver/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,3 +1,14 @@ +//! Failpoint support code shared between pageserver and safekeepers. + +use crate::http::{ + error::ApiError, + json::{json_request, json_response}, +}; +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::*; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. 
we sleep for the @@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async; // Helper function used by the macro. (A function has nicer scoping so we // don't need to decorate everything with "::") #[doc(hidden)] -pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { +pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { let millis = duration_str.parse::().unwrap(); let d = std::time::Duration::from_millis(millis); @@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> { scenario } -pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { +pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { if actions == "exit" { fail::cfg_callback(name, exit_failpoint) } else { @@ -84,3 +95,45 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. +pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because storage was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bb6c848bf4..9e9b0adfe5 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -83,6 +83,8 @@ pub mod timeout; pub mod sync; +pub mod failpoint_support; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f65c4f4580..621ad050f4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -31,6 +31,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ @@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> { } // Initialize up failpoints support - let scenario = pageserver::failpoint_support::init(); + let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 11a3a2c872..157e6b4e3e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel}; 
use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -66,9 +67,6 @@ use utils::{ lsn::Lsn, }; -// Imports only used for testing APIs -use pageserver_api::models::ConfigureFailpointsRequest; - // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. @@ -1293,34 +1291,6 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } -async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow!( - "Cannot manage failpoints because pageserver was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} - // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 58adf6e8c4..c1ce0af47b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -25,8 +25,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -pub mod failpoint_support; - use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2f2169d194..e50987c84b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -33,6 +33,7 @@ use tracing::*; use utils::backoff; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::failpoint_support; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; @@ -890,7 +891,7 @@ impl Tenant { ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); - crate::failpoint_support::sleep_millis_async!("before-attaching-tenant"); + failpoint_support::sleep_millis_async!("before-attaching-tenant"); let preload = match preload { Some(p) => p, @@ -1002,7 +1003,7 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; - crate::failpoint_support::sleep_millis_async!("attach-before-activate"); + failpoint_support::sleep_millis_async!("attach-before-activate"); info!("Done"); @@ -2839,9 +2840,7 @@ impl Tenant { } }; - crate::failpoint_support::sleep_millis_async!( - "gc_iteration_internal_after_getting_gc_timelines" - ); + failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); // If there is nothing to GC, we don't want any messages in the INFO log. 
if !gc_timelines.is_empty() { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1d14214030..a6a8972970 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -344,9 +345,7 @@ impl<'a> WalIngest<'a> { // particular point in the WAL. For more fine-grained control, // we could peek into the message and only pause if it contains // a particular string, for example, but this is enough for now. - crate::failpoint_support::sleep_millis_async!( - "wal-ingest-logical-message-sleep" - ); + failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep"); } else if let Some(path) = prefix.strip_prefix("neon-file:") { modification.put_file(path, message, ctx).await?; } From 2f83f852913887a630517b1b124424946761e7f4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 23:32:24 +0300 Subject: [PATCH 105/412] Add failpoint support to safekeeper. Just a copy paste from pageserver. --- Cargo.lock | 1 + safekeeper/Cargo.toml | 7 ++++++ safekeeper/src/bin/safekeeper.rs | 17 ++++++++++++- safekeeper/src/http/routes.rs | 8 ++++++ test_runner/fixtures/neon_fixtures.py | 36 +++++++++++++++++++++++---- 5 files changed, 63 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73cb83d3a7..55e868a6d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4449,6 +4449,7 @@ dependencies = [ "clap", "const_format", "crc32c", + "fail", "fs2", "futures", "git-version", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cccb4ebd79..4015c27933 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] async-stream.workspace = true anyhow.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true +fail.workspace = true fs2.workspace = true git-version.workspace = true hex.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e59deb9fda..33047051df 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +const FEATURES: &[&str] = &[ + #[cfg(feature = "testing")] + "testing", +]; + +fn version() -> String { + format!( + "{GIT_VERSION} failpoints: {}, features: {:?}", + fail::has_failpoints(), + FEATURES, + ) +} + const ABOUT: &str = r#" A fleet of safekeepers is responsible for reliably storing WAL received from compute, passing it through consensus (mitigating potential computes brain @@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> { // getting 'argument cannot be used multiple times' error. This seems to be // impossible with pure Derive API, so convert struct to Command, modify it, // parse arguments, and then fill the struct back. 
- let cmd = ::command().args_override_self(true); + let cmd = ::command() + .args_override_self(true) + .version(version()); let mut matches = cmd.get_matches(); let mut args = ::from_arg_matches_mut(&mut matches)?; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index c48b5330b3..25a3334e63 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -12,6 +12,8 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::fs::File; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; +use utils::failpoint_support::failpoints_handler; use std::io::Write as _; use tokio::sync::mpsc; @@ -444,6 +446,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/v1/status", |r| request_span(r, status_handler)) + .put("/v1/failpoints", |r| { + request_span(r, move |r| async { + let cancel = CancellationToken::new(); + failpoints_handler(r, cancel).await + }) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 597e311e02..9aa82d8854 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -890,8 +890,8 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) - def get_pageserver_version(self) -> str: - bin_pageserver = str(self.neon_binpath / "pageserver") + def get_binary_version(self, binary_name: str) -> str: + bin_pageserver = str(self.neon_binpath / binary_name) res = subprocess.run( [bin_pageserver, "--version"], check=True, @@ -1656,7 +1656,7 @@ class NeonPageserver(PgProtocol): self.running = False self.service_port = port self.config_override = config_override - self.version = env.get_pageserver_version() + self.version = env.get_binary_version("pageserver") # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. 
If your test expects an error, add it to @@ -2924,7 +2924,8 @@ class Safekeeper: return res def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) + is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") + return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -2975,10 +2976,11 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None): + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled = False): super().__init__() self.port = port self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" @@ -2986,6 +2988,30 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) From 4a227484bff969ddade5b42ee846c6056870ad1a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 29 Dec 2023 23:09:36 +0300 Subject: [PATCH 106/412] Add large insertion and slow WAL sending to test_hot_standby. To exercise MAX_SEND_SIZE sending from safekeeper; we've had a bug with WAL records torn across several XLogData messages. Add failpoint to safekeeper to slow down sending. Also check for corrupted WAL complains in standby log. Make the test a bit simpler in passing, e.g. we don't need explicit commits as autocommit is enabled by default. 
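
The slowdown described above is driven entirely by a failpoint, so it costs nothing unless a test switches it on. Below is a minimal sketch of that pattern using the `fail` crate; the function name `send_chunk` and its surroundings are illustrative, only the failpoint name and the `application_name == "replica"` guard come from this patch, and the patch's own `sleep_millis_async!` macro wraps essentially the same steps.

```rust
use std::time::Duration;

/// Send one XLogData payload, optionally delayed by the
/// "sk-send-wal-replica-sleep" failpoint when the receiver is a replica.
async fn send_chunk(appname: Option<&str>, chunk: &[u8]) {
    if appname == Some("replica") {
        // `fail_point!(name, closure)` returns from the *enclosing function*
        // when the failpoint fires with a `return(...)` action; wrapping it in
        // a closure turns that into a plain Option we can inspect here.
        let delay_ms: Option<String> = (|| {
            fail::fail_point!("sk-send-wal-replica-sleep", |v| v);
            None
        })();
        if let Some(ms) = delay_ms.and_then(|s| s.parse::<u64>().ok()) {
            tokio::time::sleep(Duration::from_millis(ms)).await;
        }
    }
    // ...the real sender writes `chunk` to the socket here...
    let _ = chunk;
}
```

A test enables the delay with an action such as `return(100)` (sleep 100 ms per message) through the safekeeper failpoints API and disables it again with `off`, which is exactly how the test below drives it.
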
https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719 https://github.com/neondatabase/cloud/issues/9057 --- safekeeper/src/send_wal.rs | 6 ++ test_runner/fixtures/neon_fixtures.py | 17 +++-- test_runner/regress/test_hot_standby.py | 91 +++++++++++++++---------- 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 70590a0f95..bd1d306968 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; +use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::lsn::AtomicLsn; use utils::pageserver_feedback::PageserverFeedback; @@ -559,6 +560,11 @@ impl WalSender<'_, IO> { })) .await?; + if let Some(appname) = &self.appname { + if appname == "replica" { + failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep"); + } + } trace!( "sent {} bytes of WAL {}-{}", send_size, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9aa82d8854..5b1a8ba27d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -347,7 +347,9 @@ class PgProtocol: """ return self.safe_psql_many([query], **kwargs)[0] - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + def safe_psql_many( + self, queries: List[str], log_query=True, **kwargs: Any + ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. This method passes all extra params to connstr. @@ -356,7 +358,8 @@ class PgProtocol: with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: for query in queries: - log.info(f"Executing query: {query}") + if log_query: + log.info(f"Executing query: {query}") cur.execute(query) if cur.description is None: @@ -365,11 +368,11 @@ class PgProtocol: result.append(cur.fetchall()) return result - def safe_psql_scalar(self, query) -> Any: + def safe_psql_scalar(self, query, log_query=True) -> Any: """ Execute query returning single row with single column. 
""" - return self.safe_psql(query)[0][0] + return self.safe_psql(query, log_query=log_query)[0][0] @dataclass @@ -2925,7 +2928,9 @@ class Safekeeper: def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") - return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled) + return SafekeeperHttpClient( + port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled + ) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -2976,7 +2981,7 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled = False): + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 031fd2857d..7822e29ed9 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,19 +1,59 @@ +import os +import re import time -from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv + + +def wait_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = primary.safe_psql_scalar( + "SELECT pg_current_wal_insert_lsn()::text", log_query=False + ) + while True: + secondary_lsn = secondary.safe_psql_scalar( + "SELECT pg_last_wal_replay_lsn()", log_query=False + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + +# Check for corrupted WAL messages which might otherwise go unnoticed if +# reconnection fixes this. +def scan_standby_log_for_errors(secondary): + log_path = secondary.endpoint_path() / "compute.log" + with log_path.open("r") as f: + markers = re.compile( + r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) + for line in f: + if markers.search(line): + log.info(f"bad error in standby log: {line}") + raise AssertionError() def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env + # We've had a bug caused by WAL records split across multiple XLogData + # messages resulting in corrupted WAL complains on standby. It reproduced + # only when sending from safekeeper is slow enough to grab full + # MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in + # one conf to decrease test time. 
+ slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "") + if slow_down_send: + sk_http = env.safekeepers[0].http_client() + sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")]) + with env.endpoints.create_start( branch_name="main", endpoint_id="primary", ) as primary: time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: - primary_lsn = None - caught_up = False queries = [ "SHOW neon.timeline_id", "SHOW neon.tenant_id", @@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i") - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - p_con.commit() - - with p_con.cursor() as p_cur: - p_cur.execute("SELECT pg_current_wal_insert_lsn()::text") - res = p_cur.fetchone() - assert res is not None - (lsn,) = res - primary_lsn = lsn - - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - # Note that this may generate more WAL if the transaction has changed - # things, but we don't care about that. - p_con.commit() - for query in queries: with p_con.cursor() as p_cur: p_cur.execute(query) @@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv): response = res responses[query] = response + # insert more data to make safekeeper send MAX_SEND_SIZE messages + if slow_down_send: + primary.safe_psql("create table t(key int, value text)") + primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") + + wait_caughtup(primary, secondary) + with secondary.connect() as s_con: with s_con.cursor() as s_cur: s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") res = s_cur.fetchone() assert res is not None - while not caught_up: - with s_con.cursor() as secondary_cursor: - secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") - res = secondary_cursor.fetchone() - assert res is not None - (secondary_lsn,) = res - # There may be more changes on the primary after we got our LSN - # due to e.g. autovacuum, but that shouldn't impact the content - # of the tables, so we check whether we've replayed up to at - # least after the commit of the `test` table. - caught_up = secondary_lsn >= primary_lsn - - # Explicit commit to flush any transient transaction-level state. - s_con.commit() - for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) response = secondary_cursor.fetchone() assert response is not None assert response == responses[query] + + scan_standby_log_for_errors(secondary) + + # clean up + if slow_down_send: + sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) From 54aa31980597742f28b022bd0a729b97d910f0b7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 30 Dec 2023 00:31:19 +0300 Subject: [PATCH 107/412] Don't split WAL record across two XLogData's when sending from safekeepers. As protocol demands. Not following this makes standby complain about corrupted WAL in various ways. 
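
The fix that follows amounts to a small change in how the end position of each send chunk is chosen. Here is a self-contained sketch of that calculation, assuming the usual 8 KiB WAL block size and a `max_send_size` of at least one block; names and types are simplified from the actual `WalSender` code.

```rust
/// Upper bound for the next XLogData chunk: cap at `max_send_size`, and when
/// the cap (rather than the end of available WAL) is the limit, round down to
/// an XLOG block boundary so a WAL record is never cut at an arbitrary
/// offset, as the physical replication protocol expects.
const XLOG_BLCKSZ: u64 = 8192; // standard Postgres WAL block size

fn next_chunk_end(start_pos: u64, end_pos: u64, max_send_size: u64) -> u64 {
    assert!(end_pos >= start_pos);
    debug_assert!(max_send_size >= XLOG_BLCKSZ);

    let mut chunk_end = start_pos + max_send_size;
    if chunk_end >= end_pos {
        // Everything available fits in one message; stop at the end of
        // available WAL (commit_lsn or flush_lsn, depending on the reader).
        chunk_end = end_pos;
    } else {
        // Capped by max_send_size: back off to the previous block boundary,
        // mirroring XLogSendPhysical in walsender.c.
        chunk_end -= chunk_end % XLOG_BLCKSZ;
    }
    chunk_end
}

fn main() {
    // Start mid-block with 1 MiB of WAL available and a 128 KiB cap:
    // the chunk must end exactly on a block boundary.
    let end = next_chunk_end(100, 1 << 20, 128 * 1024);
    assert_eq!(end % XLOG_BLCKSZ, 0);
    assert!(end <= 100 + 128 * 1024);
    println!("chunk ends at {end}");
}
```
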
https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719 closes https://github.com/neondatabase/cloud/issues/9057 --- safekeeper/src/send_wal.rs | 22 +++++++++++++++------- safekeeper/src/wal_storage.rs | 3 +++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index bd1d306968..9a5657a40d 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -529,12 +529,19 @@ impl WalSender<'_, IO> { ); // try to send as much as available, capped by MAX_SEND_SIZE - let mut send_size = self - .end_pos - .checked_sub(self.start_pos) - .context("reading wal without waiting for it first")? - .0 as usize; - send_size = min(send_size, self.send_buf.len()); + let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64; + // if we went behind available WAL, back off + if chunk_end_pos >= self.end_pos { + chunk_end_pos = self.end_pos; + } else { + // If sending not up to end pos, round down to page boundary to + // avoid breaking WAL record not at page boundary, as protocol + // demands. See walsender.c (XLogSendPhysical). + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize; let send_buf = &mut self.send_buf[..send_size]; let send_size: usize; { @@ -545,7 +552,8 @@ impl WalSender<'_, IO> { } else { None }; - // read wal into buffer + // Read WAL into buffer. send_size can be additionally capped to + // segment boundary here. send_size = self.wal_reader.read(send_buf).await? }; let send_buf = &send_buf[..send_size]; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index fa44b24258..e7538f805c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -565,6 +565,9 @@ impl WalReader { }) } + /// Read WAL at current position into provided buf, returns number of bytes + /// read. It can be smaller than buf size only if segment boundary is + /// reached. 
pub async fn read(&mut self, buf: &mut [u8]) -> Result { // If this timeline is new, we may not have a full segment yet, so // we pad the first bytes of the timeline's first WAL segment with 0s From aa72a22661ca841868ab11b21137c47e6c07c3e3 Mon Sep 17 00:00:00 2001 From: vipvap <91739071+vipvap@users.noreply.github.com> Date: Mon, 8 Jan 2024 09:26:27 +0000 Subject: [PATCH 108/412] Release 2024-01-08 (#6286) Release 2024-01-08 --- .github/workflows/build_and_test.yml | 8 +- Cargo.lock | 3 + compute_tools/src/monitor.rs | 25 +- control_plane/src/pageserver.rs | 7 + control_plane/src/tenant_migration.rs | 21 +- docs/sourcetree.md | 8 +- libs/pageserver_api/src/key.rs | 2 +- libs/pageserver_api/src/keyspace.rs | 3 + libs/pageserver_api/src/shard.rs | 22 +- libs/postgres_backend/src/lib.rs | 23 +- libs/remote_storage/src/azure_blob.rs | 6 + libs/remote_storage/src/lib.rs | 13 + libs/remote_storage/src/local_fs.rs | 14 + libs/remote_storage/src/s3_bucket.rs | 32 + libs/remote_storage/src/s3_bucket/metrics.rs | 8 +- libs/remote_storage/src/simulate_failures.rs | 7 + libs/safekeeper_api/src/models.rs | 6 + libs/utils/src/http/error.rs | 7 + libs/utils/src/lib.rs | 2 + libs/utils/src/sync/gate.rs | 6 + libs/utils/src/yielding_loop.rs | 35 + libs/walproposer/src/walproposer.rs | 2 +- pageserver/benches/bench_walredo.rs | 5 +- pageserver/client/src/mgmt_api.rs | 14 +- pageserver/client/src/page_service.rs | 9 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 38 +- pageserver/src/basebackup.rs | 15 +- pageserver/src/config.rs | 58 +- pageserver/src/http/routes.rs | 36 + pageserver/src/import_datadir.rs | 9 +- pageserver/src/lib.rs | 4 + pageserver/src/metrics.rs | 96 ++- pageserver/src/page_service.rs | 240 ++++-- pageserver/src/pgdatadir_mapping.rs | 275 ++++-- pageserver/src/task_mgr.rs | 5 +- pageserver/src/tenant.rs | 17 +- pageserver/src/tenant/config.rs | 2 + pageserver/src/tenant/delete.rs | 2 +- pageserver/src/tenant/mgr.rs | 289 ++++--- .../src/tenant/remote_timeline_client.rs | 28 +- pageserver/src/tenant/secondary.rs | 142 +++- pageserver/src/tenant/secondary/downloader.rs | 801 ++++++++++++++++++ .../src/tenant/secondary/heatmap_uploader.rs | 502 ++++------- pageserver/src/tenant/secondary/scheduler.rs | 361 ++++++++ .../tenant/storage_layer/inmemory_layer.rs | 43 +- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/tasks.rs | 2 + pageserver/src/tenant/timeline.rs | 173 ++-- pageserver/src/tenant/timeline/walreceiver.rs | 1 + .../walreceiver/connection_manager.rs | 3 + .../walreceiver/walreceiver_connection.rs | 36 +- pageserver/src/walingest.rs | 272 +++--- pageserver/src/walredo.rs | 64 +- pgxn/neon/libpagestore.c | 25 +- pgxn/neon/walproposer.c | 268 +++--- pgxn/neon/walproposer.h | 16 +- pgxn/neon/walproposer_pg.c | 77 +- poetry.lock | 174 +--- pre-commit.py | 24 +- pyproject.toml | 20 +- s3_scrubber/Cargo.toml | 3 + s3_scrubber/src/lib.rs | 8 +- s3_scrubber/src/main.rs | 57 +- s3_scrubber/src/scan_metadata.rs | 11 +- safekeeper/Cargo.toml | 1 + safekeeper/src/control_file.rs | 7 +- safekeeper/src/copy_timeline.rs | 250 ++++++ safekeeper/src/debug_dump.rs | 57 ++ safekeeper/src/http/routes.rs | 65 +- safekeeper/src/lib.rs | 1 + safekeeper/src/pull_timeline.rs | 131 ++- safekeeper/src/timeline.rs | 3 +- safekeeper/src/timelines_global_map.rs | 15 +- safekeeper/src/wal_backup.rs | 61 +- safekeeper/src/wal_storage.rs | 2 +- scripts/export_import_between_pageservers.py | 2 +- scripts/reformat | 4 +- test_runner/fixtures/neon_fixtures.py | 26 +- 
test_runner/fixtures/pageserver/http.py | 10 +- test_runner/performance/test_perf_olap.py | 7 +- .../performance/test_wal_backpressure.py | 3 +- .../regress/test_attach_tenant_config.py | 13 +- test_runner/regress/test_compatibility.py | 8 +- test_runner/regress/test_crafted_wal_end.py | 1 - test_runner/regress/test_layer_eviction.py | 4 +- .../regress/test_layers_from_future.py | 3 + test_runner/regress/test_pageserver_api.py | 3 +- .../regress/test_pageserver_secondary.py | 149 +++- test_runner/regress/test_tenant_detach.py | 14 +- test_runner/regress/test_tenant_relocation.py | 6 +- .../test_tenants_with_remote_storage.py | 8 +- test_runner/regress/test_wal_acceptor.py | 82 +- vm-image-spec.yaml | 21 +- 93 files changed, 4014 insertions(+), 1429 deletions(-) create mode 100644 libs/utils/src/yielding_loop.rs create mode 100644 pageserver/src/tenant/secondary/downloader.rs create mode 100644 pageserver/src/tenant/secondary/scheduler.rs create mode 100644 safekeeper/src/copy_timeline.rs diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 78deff6e85..880d6044f2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -105,11 +105,11 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run ruff to ensure code format - run: poetry run ruff . + - name: Run `ruff check` to ensure code format + run: poetry run ruff check . - - name: Run black to ensure code format - run: poetry run black --diff --check . + - name: Run `ruff format` to ensure code format + run: poetry run ruff format --check . - name: Run mypy to check types run: poetry run mypy . diff --git a/Cargo.lock b/Cargo.lock index 55e868a6d5..4dd195a895 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4405,12 +4405,14 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", + "aws-smithy-async", "bincode", "bytes", "chrono", "clap", "crc32c", "either", + "futures", "futures-util", "hex", "histogram", @@ -4473,6 +4475,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "sha2", "signal-hook", "storage_broker", "thiserror", diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f974d6023d..fd19b7e53f 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,7 +3,7 @@ use std::{thread, time::Duration}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use crate::compute::ComputeNode; @@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) { } } + // If there are existing (logical) walsenders, do not suspend. + // + // walproposer doesn't currently show up in pg_stat_replication, + // but protect if it will be + let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';"; + match cli.query_one(ws_count_query, &[]) { + Ok(r) => match r.try_get::<&str, i64>("count") { + Ok(num_ws) => { + if num_ws > 0 { + last_active = Some(Utc::now()); + } + } + Err(e) => { + warn!("failed to parse ws count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!("failed to get list of walsenders: {:?}", e); + continue; + } + } + // Update the last activity in the shared state if we got a more recent one. let mut state = compute.state.lock().unwrap(); // NB: `Some()` is always greater than `None`. 
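
The suspend-monitor change above boils down to one catalog query per poll. A condensed sketch of that check, using the blocking `postgres` client the monitor already holds (error handling simplified, the function name is illustrative):

```rust
use postgres::Client;

/// Returns true if any walsender other than walproposer is connected, in
/// which case the compute counts as active and must not be suspended.
fn has_active_walsenders(cli: &mut Client) -> anyhow::Result<bool> {
    let row = cli.query_one(
        "select count(*) from pg_stat_replication where application_name != 'walproposer'",
        &[],
    )?;
    let num_ws: i64 = row.try_get("count")?;
    Ok(num_ws > 0)
}
```

On a `true` result the caller simply bumps `last_active` to the current time, the same way the loop above handles recent client activity.
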
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 7d490016bf..fb0d251722 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -485,6 +485,13 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_id).await?) } + pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { + Ok(self + .http_client + .tenant_secondary_download(*tenant_id) + .await?) + } + pub async fn timeline_create( &self, tenant_id: TenantId, diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index 79df108896..23ea8f4060 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -11,6 +11,7 @@ use crate::{ use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; +use pageserver_api::shard::TenantShardId; use std::collections::HashMap; use std::time::Duration; use utils::{ @@ -40,9 +41,9 @@ async fn await_lsn( loop { let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, - Err(e) => { + Err(_e) => { println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", + "🕑 Waiting for pageserver {} to activate...", pageserver.conf.id ); std::thread::sleep(Duration::from_millis(500)); @@ -89,7 +90,7 @@ pub async fn migrate_tenant( tenant_id: TenantId, dest_ps: PageServerNode, ) -> anyhow::Result<()> { - // Get a new generation + println!("🤔 Checking existing status..."); let attachment_service = AttachmentService::from_env(env); fn build_location_config( @@ -135,6 +136,20 @@ pub async fn migrate_tenant( baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } + println!( + "🔁 Downloading latest layers to destination pageserver {}", + dest_ps.conf.id + ); + match dest_ps + .tenant_secondary_download(&TenantShardId::unsharded(tenant_id)) + .await + { + Ok(()) => {} + Err(_) => { + println!(" (skipping, destination wasn't in secondary mode)") + } + } + let gen = attachment_service .attach_hook(tenant_id, dest_ps.conf.id) .await?; diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 95bed83ae5..12fa80349e 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `black`, `ruff`, and type hints via `mypy`. +We force code formatting via `ruff`, and type hints via `mypy`. Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run black . # All code is reformatted -poetry run ruff . # Python linter -poetry run mypy . # Ensure there are no typing errors +poetry run ruff format . # All code is reformatted +poetry run ruff check . # Python linter +poetry run mypy . # Ensure there are no typing errors ``` **WARNING**: do not run `mypy` from a directory other than the root of the repository. 
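
The `control_plane/src/tenant_migration.rs` change above reorders migration so the destination warms its layer cache while it is still a secondary. A rough sketch of that ordering against the pageserver management API follows; `MgmtClient` is a stand-in written for illustration, and only the `/v1/tenant/{id}/secondary/download` endpoint shape is taken from this patch.

```rust
use anyhow::Result;

/// Minimal stand-in for the pageserver management API client.
struct MgmtClient {
    http: reqwest::Client,
    base_url: String, // e.g. "http://pageserver:9898"
}

impl MgmtClient {
    /// Ask a pageserver in secondary mode to download the latest layers
    /// listed in the tenant's heatmap.
    async fn tenant_secondary_download(&self, tenant_shard_id: &str) -> Result<()> {
        let url = format!(
            "{}/v1/tenant/{}/secondary/download",
            self.base_url, tenant_shard_id
        );
        self.http.post(url).send().await?.error_for_status()?;
        Ok(())
    }
}

async fn migrate_to(dest: &MgmtClient, tenant_shard_id: &str) -> Result<()> {
    // Best effort: if the destination is not currently a secondary this call
    // fails and the migration simply proceeds with a cold cache.
    if dest.tenant_secondary_download(tenant_shard_id).await.is_err() {
        println!("  (skipping, destination wasn't in secondary mode)");
    }
    // ...then fetch a new generation from the attachment service and attach,
    // as migrate_tenant() does above.
    Ok(())
}
```
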
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index d680a5600e..3e1bba2a06 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -142,7 +142,7 @@ impl Key { } pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 + key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff } impl std::str::FromStr for Key { diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 16651c322e..80183506d8 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -124,6 +124,9 @@ impl KeySpaceAccum { if range.start == accum.end { accum.end = range.end; } else { + // TODO: to efficiently support small sharding stripe sizes, we should avoid starting + // a new range here if the skipped region was all keys that don't belong on this shard. + // (https://github.com/neondatabase/neon/issues/6247) assert!(range.start > accum.end); self.ranges.push(accum.clone()); *accum = range; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 3e4936eec4..18ef2be523 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -422,6 +422,21 @@ impl ShardIdentity { } } + /// Return true if the key should be discarded if found in this shard's + /// data store, e.g. during compaction after a split + pub fn is_key_disposable(&self, key: &Key) -> bool { + if key_is_shard0(key) { + // Q: Why can't we dispose of shard0 content if we're not shard 0? + // A: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + false + } else { + !self.is_key_local(key) + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) @@ -515,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - // - // In this condition: - // - is_rel_block_key includes only relations, i.e. excludes SLRU data and - // all metadata. - // - field6 is set to -1 for relation size pages. 
- !(is_rel_block_key(key) && key.field6 != 0xffffffff) + !is_rel_block_key(key) } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 1dae008a4f..73d25619c3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -35,6 +35,12 @@ pub enum QueryError { /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, + /// Query handler indicated that client should reconnect + #[error("Server requested reconnect")] + Reconnect, + /// Query named an entity that was not found + #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), /// Authentication failure #[error("Unauthorized: {0}")] Unauthorized(std::borrow::Cow<'static, str>), @@ -54,9 +60,9 @@ impl From for QueryError { impl QueryError { pub fn pg_error_code(&self) -> &'static [u8; 5] { match self { - Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure + Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, - Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, + Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error } } @@ -425,6 +431,11 @@ impl PostgresBackend { info!("Stopped due to shutdown"); Ok(()) } + Err(QueryError::Reconnect) => { + // Dropping out of this loop implicitly disconnects + info!("Stopped due to handler reconnect request"); + Ok(()) + } Err(QueryError::Disconnected(e)) => { info!("Disconnected ({e:#})"); // Disconnection is not an error: we just use it that way internally to drop @@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Reconnect => "reconnect".to_string(), QueryError::Shutdown => "shutdown".to_string(), + QueryError::NotFound(_) => "not found".to_string(), QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), @@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") } + QueryError::Reconnect => { + info!("query handler for '{query}' requested client to reconnect") + } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } + QueryError::NotFound(reason) => { + info!("query handler for '{query}' entity not found: {reason}") + } QueryError::Unauthorized(e) => { warn!("query handler for '{query}' failed with authentication error: {e}"); } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 7ea1103eb2..18cf5d97ba 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -322,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage { } Ok(()) } + + async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> { + Err(anyhow::anyhow!( + "copy for azure blob storage is not implemented" + )) + } } pin_project_lite::pin_project! 
{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3e408e3119..942d0016b0 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; + + /// Copy a remote object inside a bucket from one path to another. + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; } pub type DownloadStream = Pin> + Unpin + Send + Sync>>; @@ -374,6 +377,15 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.delete_objects(paths).await, } } + + pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.copy(from, to).await, + Self::AwsS3(s) => s.copy(from, to).await, + Self::AzureBlob(s) => s.copy(from, to).await, + Self::Unreliable(s) => s.copy(from, to).await, + } + } } impl GenericRemoteStorage { @@ -660,6 +672,7 @@ impl ConcurrencyLimiter { RequestKind::Put => &self.write, RequestKind::List => &self.read, RequestKind::Delete => &self.write, + RequestKind::Copy => &self.write, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index d1e7d325b9..bf8b6b5dde 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let from_path = from.with_base(&self.storage_root); + let to_path = to.with_base(&self.storage_root); + create_target_directory(&to_path).await?; + fs::copy(&from_path, &to_path).await.with_context(|| { + format!( + "Failed to copy file from '{from_path}' to '{to_path}'", + from_path = from_path, + to_path = to_path + ) + })?; + Ok(()) + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0f95458ad1..d7b41edaaf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket { Ok(()) } + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let kind = RequestKind::Copy; + let _guard = self.permit(kind).await; + + let started_at = start_measuring_requests(kind); + + // we need to specify bucket_name as a prefix + let copy_source = format!( + "{}/{}", + self.bucket_name, + self.relative_path_to_s3_object(from) + ); + + let res = self + .client + .copy_object() + .bucket(self.bucket_name.clone()) + .key(self.relative_path_to_s3_object(to)) + .copy_source(copy_source) + .send() + .await; + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + res?; + + Ok(()) + } + async fn download(&self, from: &RemotePath) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs index ea11edafa5..21dde14906 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -11,6 +11,7 @@ pub(crate) enum RequestKind { Put = 1, Delete = 2, List = 3, + Copy = 4, } use RequestKind::*; @@ -22,6 +23,7 @@ impl RequestKind { 
Put => "put_object", Delete => "delete_object", List => "list_objects", + Copy => "copy_object", } } const fn as_index(&self) -> usize { @@ -29,7 +31,7 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 4]); +pub(super) struct RequestTyped([C; 5]); impl RequestTyped { pub(super) fn get(&self, kind: RequestKind) -> &C { @@ -38,8 +40,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 802b0db7f5..7f5adcea30 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + // copy is equivalent to download + upload + self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.copy_object(from, to).await + } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 786712deb1..ce5a1e411e 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -51,3 +51,9 @@ pub struct SkTimelineInfo { #[serde(default)] pub http_connstr: Option, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineCopyRequest { + pub target_timeline_id: TimelineId, + pub until_lsn: Lsn, +} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index ac68b04888..3e9281ac81 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -31,6 +31,9 @@ pub enum ApiError { #[error("Shutting down")] ShuttingDown, + #[error("Timeout")] + Timeout(Cow<'static, str>), + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -67,6 +70,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9e9b0adfe5..890061dc59 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -85,6 +85,8 @@ pub mod sync; pub mod failpoint_support; +pub mod yielding_loop; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 31c76d2f74..abc3842da8 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -15,6 +15,12 @@ pub struct Gate { name: String, } +impl std::fmt::Debug for Gate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Gate<{}>", self.name) + } +} + /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. 
#[derive(Debug)] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs new file mode 100644 index 0000000000..963279eb4c --- /dev/null +++ b/libs/utils/src/yielding_loop.rs @@ -0,0 +1,35 @@ +use tokio_util::sync::CancellationToken; + +#[derive(thiserror::Error, Debug)] +pub enum YieldingLoopError { + #[error("Cancelled")] + Cancelled, +} + +/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically +/// yields to avoid blocking the executor, and after resuming checks the provided +/// cancellation token to drop out promptly on shutdown. +#[inline(always)] +pub async fn yielding_loop( + interval: usize, + cancel: &CancellationToken, + iter: I, + mut visitor: F, +) -> Result<(), YieldingLoopError> +where + I: Iterator, + F: FnMut(T), +{ + for (i, item) in iter.enumerate() { + visitor(item); + + if i + 1 % interval == 0 { + tokio::task::yield_now().await; + if cancel.is_cancelled() { + return Err(YieldingLoopError::Cancelled); + } + } + } + + Ok(()) +} diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 35c8f6904d..7251545792 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -425,7 +425,7 @@ mod tests { } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { - println!("walprop_log[{}] {}", level, msg); + println!("wp_log[{}] {}", level, msg); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ba41866935..4837626086 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -13,6 +13,7 @@ use bytes::{Buf, Bytes}; use pageserver::{ config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; +use pageserver_api::shard::TenantShardId; use utils::{id::TenantId, lsn::Lsn}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; @@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) { let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 87e4ed8efd..4c285293f7 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use pageserver_api::models::*; +use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method}; use utils::{ http::error::HttpErrorBody, @@ -164,6 +164,18 @@ impl Client { Ok(()) } + pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{}/secondary/download", + self.mgmt_api_endpoint, tenant_id + ); + self.request(Method::POST, &uri, ()) + .await? 
+ .error_for_status() + .map(|_| ()) + .map_err(|e| Error::ApiError(format!("{}", e))) + } + pub async fn location_config( &self, tenant_id: TenantId, diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index fc0d2311f7..231461267a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -115,15 +115,8 @@ impl PagestreamClient { pub async fn getpage( &mut self, - key: RelTagBlockNo, - lsn: Lsn, + req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamGetPageRequest { - latest: false, - rel: key.rel_tag, - blkno: key.block_no, - lsn, - }; let req = PagestreamFeMessage::GetPage(req); let req: bytes::Bytes = req.serialize(); // let mut req = tokio_util::io::ReaderStream::new(&req); diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 16d198ab0e..cb36a403f1 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -3,7 +3,7 @@ use futures::future::join_all; use pageserver::pgdatadir_mapping::key_to_rel_block; use pageserver::repository; use pageserver_api::key::is_rel_block_key; -use pageserver_client::page_service::RelTagBlockNo; +use pageserver_api::models::PagestreamGetPageRequest; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -39,6 +39,9 @@ pub(crate) struct Args { runtime: Option, #[clap(long)] per_target_rate_limit: Option, + /// Probability for sending `latest=true` in the request (uniform distribution). + #[clap(long, default_value = "1")] + req_latest_probability: f64, #[clap(long)] limit_to_first_n_targets: Option, targets: Option>, @@ -200,18 +203,26 @@ async fn main_impl( start_work_barrier.wait().await; loop { - let (range, key) = { + let (timeline, req) = { let mut rng = rand::thread_rng(); let r = &all_ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = repository::Key::from_i128(key); let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - (r, RelTagBlockNo { rel_tag, block_no }) + ( + r.timeline, + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + }, + ) }; - let sender = work_senders.get(&range.timeline).unwrap(); + let sender = work_senders.get(&timeline).unwrap(); // TODO: what if this blocks? 
- sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + sender.send(req).await.ok().unwrap(); } }), Some(rps_limit) => Box::pin(async move { @@ -240,16 +251,21 @@ async fn main_impl( ); loop { ticker.tick().await; - let (range, key) = { + let req = { let mut rng = rand::thread_rng(); let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = repository::Key::from_i128(key); let (rel_tag, block_no) = key_to_rel_block(key) .expect("we filter non-rel-block keys out above"); - (r, RelTagBlockNo { rel_tag, block_no }) + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } }; - sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + sender.send(req).await.ok().unwrap(); } }) }; @@ -303,7 +319,7 @@ async fn client( args: &'static Args, timeline: TenantTimelineId, start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>, + mut work: tokio::sync::mpsc::Receiver, all_work_done_barrier: Arc, live_stats: Arc, ) { @@ -317,10 +333,10 @@ async fn client( .await .unwrap(); - while let Some((key, lsn)) = work.recv().await { + while let Some(req) = work.recv().await { let start = Instant::now(); client - .getpage(key, lsn) + .getpage(req) .await .with_context(|| format!("getpage for {timeline}")) .unwrap(); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ed452eae7d..7e5ae892ad 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,6 +23,7 @@ use tracing::*; use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; +use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -174,7 +175,7 @@ where ] { for segno in self .timeline - .list_slru_segments(kind, self.lsn, self.ctx) + .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) .await? { self.add_slru_segment(kind, segno).await?; @@ -192,7 +193,7 @@ where // Otherwise only include init forks of unlogged relations. 
let rels = self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -267,7 +268,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, self.lsn, false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) .await?; // If the relation is empty, create an empty file @@ -288,7 +289,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -310,7 +311,7 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn, self.ctx) + .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); @@ -352,7 +353,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) + .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; ensure!( @@ -399,7 +400,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await? .is_empty() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8516f397ca..7c03dc1bdd 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -37,8 +37,8 @@ use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, - TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -75,6 +75,9 @@ pub mod defaults { pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; /// /// Default built-in configuration file. 
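
Each of the new knobs above follows the same three-step pattern the rest of this config diff extends: a compiled-in default, a builder field, and a TOML override. A stripped-down sketch of that pattern, with types and names simplified from the real `PageServerConfigBuilder`:

```rust
use anyhow::anyhow;

/// Either an explicit value or "not set yet"; the builder starts every field
/// at its default and `build()` fails if anything is left unset.
enum BuilderValue<T> {
    Set(T),
    NotSet,
}

impl<T> BuilderValue<T> {
    fn ok_or_missing(self, field: &str) -> anyhow::Result<T> {
        match self {
            BuilderValue::Set(v) => Ok(v),
            BuilderValue::NotSet => Err(anyhow!("missing {field}")),
        }
    }
}

struct ConfigBuilder {
    ingest_batch_size: BuilderValue<u64>,
}

impl Default for ConfigBuilder {
    fn default() -> Self {
        // Mirrors DEFAULT_INGEST_BATCH_SIZE = 100 from the hunk above.
        Self {
            ingest_batch_size: BuilderValue::Set(100),
        }
    }
}

impl ConfigBuilder {
    /// Called only when the key is present in pageserver.toml.
    fn ingest_batch_size(&mut self, value: u64) {
        self.ingest_batch_size = BuilderValue::Set(value);
    }

    fn build(self) -> anyhow::Result<u64> {
        self.ingest_batch_size.ok_or_missing("ingest_batch_size")
    }
}
```
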
@@ -88,6 +91,7 @@ pub mod defaults { #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' +#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} # initial superuser role name to use when creating a new tenant @@ -108,6 +112,8 @@ pub mod defaults { #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' +#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -125,6 +131,7 @@ pub mod defaults { #gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} +#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} [remote_storage] @@ -233,6 +240,13 @@ pub struct PageServerConf { /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize /// heatmap uploads vs. other remote storage operations. pub heatmap_upload_concurrency: usize, + + /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly + /// deprioritises secondary downloads vs. remote storage operations for attached tenants. + pub secondary_download_concurrency: usize, + + /// Maximum number of WAL records to be ingested and committed at the same time + pub ingest_batch_size: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -314,6 +328,9 @@ struct PageServerConfigBuilder { control_plane_emergency_mode: BuilderValue, heatmap_upload_concurrency: BuilderValue, + secondary_download_concurrency: BuilderValue, + + ingest_batch_size: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -386,6 +403,9 @@ impl Default for PageServerConfigBuilder { control_plane_emergency_mode: Set(false), heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), } } } @@ -534,6 +554,14 @@ impl PageServerConfigBuilder { self.heatmap_upload_concurrency = BuilderValue::Set(value) } + pub fn secondary_download_concurrency(&mut self, value: usize) { + self.secondary_download_concurrency = BuilderValue::Set(value) + } + + pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { + self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -632,10 +660,15 @@ impl PageServerConfigBuilder { control_plane_emergency_mode: self .control_plane_emergency_mode .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self .heatmap_upload_concurrency .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, + secondary_download_concurrency: self + .secondary_download_concurrency + .ok_or(anyhow!("missing secondary_download_concurrency"))?, + ingest_batch_size: self + .ingest_batch_size + .ok_or(anyhow!("missing ingest_batch_size"))?, }) } } @@ -693,6 +726,11 @@ impl PageServerConf { .join(TENANT_LOCATION_CONFIG_NAME) } + pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(TENANT_HEATMAP_BASENAME) + } + pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TIMELINES_SEGMENT_NAME) @@ -878,6 +916,10 @@ impl PageServerConf { 
"heatmap_upload_concurrency" => { builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) }, + "secondary_download_concurrency" => { + builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) + }, + "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -949,6 +991,8 @@ impl PageServerConf { control_plane_api_token: None, control_plane_emergency_mode: false, heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, } } } @@ -1177,7 +1221,9 @@ background_task_maximum_delay = '334 s' control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, }, "Correct defaults should be used when no config values are provided" ); @@ -1238,7 +1284,9 @@ background_task_maximum_delay = '334 s' control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: 100, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 157e6b4e3e..5c7747d353 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -152,6 +152,7 @@ impl From for ApiError { PageReconstructError::AncestorStopping(_) => { ApiError::ResourceUnavailable(format!("{pre}").into()) } + PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } } @@ -1273,6 +1274,23 @@ async fn put_tenant_location_config_handler( // which is not a 400 but a 409. 
.map_err(ApiError::BadRequest)?; + if let Some(_flush_ms) = flush { + match state + .secondary_controller + .upload_tenant(tenant_shard_id) + .await + { + Ok(()) => { + tracing::info!("Uploaded heatmap during flush"); + } + Err(e) => { + tracing::warn!("Failed to flush heatmap: {e}"); + } + } + } else { + tracing::info!("No flush requested when configuring"); + } + json_response(StatusCode::OK, ()) } @@ -1610,6 +1628,21 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn secondary_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + state + .secondary_controller + .download_tenant(tenant_shard_id) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1878,6 +1911,9 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { + api_handler(r, secondary_download_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d95d75449d..d66df36b3a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -21,6 +21,7 @@ use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; @@ -312,13 +313,16 @@ async fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + WAL_INGEST.records_committed.inc(); + + modification.commit(ctx).await?; last_lsn = lsn; nrecords += 1; @@ -448,13 +452,14 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); - let mut modification = tline.begin_modification(end_lsn); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + modification.commit(ctx).await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c1ce0af47b..26070e0cc1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -117,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config"; /// Full path: `tenants//config`. pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +/// Per-tenant copy of their remote heatmap, downloaded into the local +/// tenant path while in secondary mode. +pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; + /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. 
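Together with the heatmap upload now performed when a location_config change requests a flush (handled above), the new secondary_download_handler and its route give the control plane a way to make a secondary location pull the layers named in its heatmap on demand. A hypothetical caller, sketched with reqwest: the endpoint path and the 200-with-empty-JSON response come from the handler and router above, while the client, base_url and the string form of the shard id are assumptions, not part of the patch:

    async fn trigger_secondary_download(
        client: &reqwest::Client,
        base_url: &str,
        tenant_shard_id: &str,
    ) -> anyhow::Result<()> {
        // POST /v1/tenant/:tenant_shard_id/secondary/download, as registered in the router above.
        let url = format!("{base_url}/v1/tenant/{tenant_shard_id}/secondary/download");
        let resp = client.post(url).send().await?;
        anyhow::ensure!(resp.status().is_success(), "unexpected status {}", resp.status());
        Ok(())
    }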
pub const TEMP_FILE_SUFFIX: &str = "___temp"; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4725903783..6f4431c3cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ // Metrics collected on operations on the storage repository. #[derive(Debug, EnumVariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] -pub enum StorageTimeOperation { +pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, @@ -55,7 +55,7 @@ pub enum StorageTimeOperation { CreateTenant, } -pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { register_counter_vec!( "pageserver_storage_operations_seconds_sum", "Total time spent on storage operations with operation, tenant and timeline dimensions", @@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_storage_operations_seconds_count", "Count of storage operations with operation, tenant and timeline dimensions", @@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub struct PageCacheMetricsForTaskKind { +pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind { pub read_hits_materialized_page_older_lsn: IntCounter, } -pub struct PageCacheMetrics { +pub(crate) struct PageCacheMetrics { map: EnumMap>, } @@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { +pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { let task_kind = ::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); @@ -243,10 +243,9 @@ impl PageCacheMetrics { } } -pub struct PageCacheSizeMetrics { +pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, - pub current_bytes_ephemeral: UIntGauge, pub current_bytes_immutable: UIntGauge, pub current_bytes_materialized_page: UIntGauge, } @@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheSizeMetrics { - max_bytes: { - register_uint_gauge!( - "pageserver_page_cache_size_max_bytes", - "Maximum size of the page cache in bytes" - ) - .expect("failed to define a metric") - }, - - current_bytes_ephemeral: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["ephemeral"]) - .unwrap() - }, - current_bytes_immutable: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["immutable"]) - .unwrap() - }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, -}); +pub(crate) static PAGE_CACHE_SIZE: Lazy = + Lazy::new(|| PageCacheSizeMetrics { + max_bytes: { + register_uint_gauge!( + "pageserver_page_cache_size_max_bytes", + "Maximum size of the page cache in bytes" + ) + .expect("failed to define a metric") + }, + 
current_bytes_immutable: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + current_bytes_materialized_page: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, + }); pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; @@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy = Lazy::new(|| { /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] -pub struct EvictionsWithLowResidenceDuration { +pub(crate) struct EvictionsWithLowResidenceDuration { data_source: &'static str, threshold: Duration, counter: Option, } -pub struct EvictionsWithLowResidenceDurationBuilder { +pub(crate) struct EvictionsWithLowResidenceDurationBuilder { data_source: &'static str, threshold: Duration, } @@ -1009,7 +1003,7 @@ pub enum SmgrQueryType { } #[derive(Debug)] -pub struct SmgrQueryTimePerTimeline { +pub(crate) struct SmgrQueryTimePerTimeline { metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], } @@ -1181,8 +1175,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub struct BasebackupQueryTime(HistogramVec); -pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { +pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { BasebackupQueryTime({ register_histogram_vec!( "pageserver_basebackup_query_seconds", @@ -1202,7 +1196,7 @@ impl DurationResultObserver for BasebackupQueryTime { } } -pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { +pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_live_connections", "Number of live network connections", @@ -1369,6 +1363,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, pub(crate) upload_heatmap_duration: Histogram, + pub(crate) download_heatmap: IntCounter, + pub(crate) download_layer: IntCounter, } pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { upload_heatmap: register_int_counter!( @@ -1386,6 +1382,16 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Time to build and upload a heatmap, including any waiting inside the S3 client" ) .expect("failed to define a metric"), + download_heatmap: register_int_counter!( + "pageserver_secondary_download_heatmap", + "Number of downloads of heatmaps by secondary mode locations" + ) + .expect("failed to define a metric"), + download_layer: register_int_counter!( + "pageserver_secondary_download_layer", + "Number of downloads of layers by secondary mode locations" + ) + .expect("failed to define a metric"), }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1655,7 +1661,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); /// Similar to `prometheus::HistogramTimer` but does not record on drop. -pub struct StorageTimeMetricsTimer { +pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, } @@ -1680,7 +1686,7 @@ impl StorageTimeMetricsTimer { /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. 
#[derive(Clone, Debug)] -pub struct StorageTimeMetrics { +pub(crate) struct StorageTimeMetrics { /// Sum of f64 seconds, per operation, tenant_id and timeline_id timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id @@ -1719,7 +1725,7 @@ impl StorageTimeMetrics { } #[derive(Debug)] -pub struct TimelineMetrics { +pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, timeline_id: String, @@ -1927,7 +1933,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge { } } -pub struct RemoteTimelineClientMetrics { +pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, @@ -2225,7 +2231,7 @@ impl Drop for RemoteTimelineClientMetrics { /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. -pub trait MeasureRemoteOp: Sized { +pub(crate) trait MeasureRemoteOp: Sized { fn measure_remote_op( self, tenant_id: TenantId, @@ -2250,7 +2256,7 @@ pub trait MeasureRemoteOp: Sized { impl MeasureRemoteOp for T {} pin_project! { - pub struct MeasuredRemoteOp + pub(crate) struct MeasuredRemoteOp { #[pin] inner: F, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d5ca7f7382..291490d016 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; +use std::borrow::Cow; use std::io; use std::net::TcpListener; use std::pin::pin; @@ -53,7 +54,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::pgdatadir_mapping::rel_block_to_key; +use crate::pgdatadir_mapping::{rel_block_to_key, Version}; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; @@ -61,6 +62,9 @@ use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::ShardSelector; +use crate::tenant::timeline::WaitLsnError; +use crate::tenant::GetTimelineError; +use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; @@ -283,6 +287,64 @@ struct PageServerHandler { connection_ctx: RequestContext, } +#[derive(thiserror::Error, Debug)] +enum PageStreamError { + /// We encountered an error that should prompt the client to reconnect: + /// in practice this means we drop the connection without sending a response. + #[error("Reconnect required: {0}")] + Reconnect(Cow<'static, str>), + + /// We were instructed to shutdown while processing the query + #[error("Shutting down")] + Shutdown, + + /// Something went wrong reading a page: this likely indicates a pageserver bug + #[error("Read error: {0}")] + Read(PageReconstructError), + + /// Ran out of time waiting for an LSN + #[error("LSN timeout: {0}")] + LsnTimeout(WaitLsnError), + + /// The entity required to serve the request (tenant or timeline) is not found, + /// or is not found in a suitable state to serve a request. 
+ #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), + + /// Request asked for something that doesn't make sense, like an invalid LSN + #[error("Bad request: {0}")] + BadRequest(std::borrow::Cow<'static, str>), +} + +impl From for PageStreamError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::Shutdown, + e => Self::Read(e), + } + } +} + +impl From for PageStreamError { + fn from(value: GetActiveTimelineError) -> Self { + match value { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), + GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), + } + } +} + +impl From for PageStreamError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + } + } +} + impl PageServerHandler { pub fn new( conf: &'static PageServerConf, @@ -428,7 +490,7 @@ impl PageServerHandler { // Check that the timeline exists let timeline = tenant .get_timeline(timeline_id, true) - .map_err(|e| anyhow::anyhow!(e))?; + .map_err(|e| QueryError::NotFound(format!("{e}").into()))?; // Avoid starting new requests if the timeline has already started shutting down, // and block timeline shutdown until this request is complete, or drops out due @@ -520,32 +582,44 @@ impl PageServerHandler { } }; - if let Err(e) = &response { - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - if timeline.cancel.is_cancelled() || timeline.is_stopping() { + match response { + Err(PageStreamError::Shutdown) => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. - span.in_scope(|| info!("dropped response during shutdown: {e:#}")); + span.in_scope(|| info!("dropping connection due to shutdown")); return Err(QueryError::Shutdown); } + Err(PageStreamError::Reconnect(reason)) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => { + // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean + // shutdown error, this may be buried inside a PageReconstructError::Other for example. + // + // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, + // because wait_lsn etc will drop out + // is_stopping(): [`Timeline::flush_and_shutdown`] has entered + // is_canceled(): [`Timeline::shutdown`]` has entered + span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); + return Err(QueryError::Shutdown); + } + r => { + let response_msg = r.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. 
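The PageStreamError enum and the From conversions above let the getpage handlers bubble errors up with the ? operator while still distinguishing shutdown, reconnect and bad-request cases when deciding how to answer the client. A minimal sketch written against those conversions; this is a hypothetical helper, not a function from the patch, and Version comes from the pgdatadir_mapping changes later in this series:

    async fn rel_exists_at(
        timeline: &Timeline,
        rel: RelTag,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<bool, PageStreamError> {
        // WaitLsnError -> PageStreamError (LsnTimeout / Shutdown / Reconnect) via the From impl above.
        timeline.wait_lsn(lsn, ctx).await?;
        // PageReconstructError -> PageStreamError (Read / Shutdown) via the From impl above.
        let exists = timeline
            .get_rel_exists(rel, Version::Lsn(lsn), false, ctx)
            .await?;
        Ok(exists)
    }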
+ span.in_scope(|| error!("error reading relation or page version: {:#}", e)); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + self.flush_cancellable(pgb, &timeline.cancel).await?; + } } - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. - span.in_scope(|| error!("error reading relation or page version: {:#}", e)); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; - self.flush_cancellable(pgb, &timeline.cancel).await?; } Ok(()) } @@ -692,7 +766,7 @@ impl PageServerHandler { latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { if latest { // Latest page version was requested. If LSN is given, it is a hint // to the page server that there have been no modifications to the @@ -723,15 +797,19 @@ impl PageServerHandler { } } else { if lsn == Lsn(0) { - anyhow::bail!("invalid LSN(0) in request"); + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); } timeline.wait_lsn(lsn, ctx).await?; } - anyhow::ensure!( - lsn >= **latest_gc_cutoff_lsn, - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ); + + if lsn < **latest_gc_cutoff_lsn { + return Err(PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + lsn, **latest_gc_cutoff_lsn + ).into())); + } Ok(lsn) } @@ -740,14 +818,14 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let exists = timeline - .get_rel_exists(req.rel, lsn, req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -760,13 +838,15 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; + let n_blocks = timeline + .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -778,14 +858,20 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::Lsn(lsn), + req.latest, + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as 
i64; @@ -794,30 +880,35 @@ impl PageServerHandler { })) } + async fn do_handle_get_page_at_lsn_request( + &self, + timeline: &Timeline, + req: &PagestreamGetPageRequest, + ctx: &RequestContext, + ) -> Result { + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; + let page = timeline + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .await?; + + Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { + page, + })) + } + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ctx: &RequestContext, - ) -> anyhow::Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; - /* - // Add a 1s delay to some requests. The delay helps the requests to - // hit the race condition from github issue #1047 more easily. - use rand::Rng; - if rand::thread_rng().gen::() < 5 { - std::thread::sleep(std::time::Duration::from_millis(1000)); - } - */ - + ) -> Result { let key = rel_block_to_key(req.rel, req.blkno); - let page = if timeline.get_shard_identity().is_key_local(&key) { - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? + if timeline.get_shard_identity().is_key_local(&key) { + self.do_handle_get_page_at_lsn_request(timeline, req, ctx) + .await } else { // The Tenant shard we looked up at connection start does not hold this particular // key: look for other shards in this tenant. This scenario occurs if a pageserver @@ -836,30 +927,30 @@ impl PageServerHandler { Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { // We already know this tenant exists in general, because we resolved it at // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node. - - // TODO: this should be some kind of structured error that the client will understand, - // so that it can block until its config is updated: this error is expected in the case - // that the Tenant's shards' placements are being updated and the client hasn't been - // informed yet. - // - // https://github.com/neondatabase/neon/issues/6038 - return Err(anyhow::anyhow!("Request routed to wrong shard")); + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}", + timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key); + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } Err(e) => return Err(e.into()), }; // Take a GateGuard for the duration of this request. If we were using our main Timeline object, // the GateGuard was already held over the whole connection. - let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? 
- }; + let _timeline_guard = timeline + .gate + .enter() + .map_err(|_| PageStreamError::Shutdown)?; - Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { - page, - })) + self.do_handle_get_page_at_lsn_request(&timeline, req, ctx) + .await + } } #[allow(clippy::too_many_arguments)] @@ -1000,9 +1091,7 @@ impl PageServerHandler { ) .await .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?; + let timeline = tenant.get_timeline(timeline_id, true)?; Ok(timeline) } } @@ -1424,14 +1513,15 @@ enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] - Timeline(anyhow::Error), + Timeline(#[from] GetTimelineError), } impl From for QueryError { fn from(e: GetActiveTimelineError) -> Self { match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown, GetActiveTimelineError::Tenant(e) => e.into(), - GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()), } } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index e9884a15f5..f11a72f2ab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -11,7 +11,7 @@ use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; -use anyhow::Context; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; use pageserver_api::key::is_rel_block_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -147,6 +147,7 @@ impl Timeline { { DatadirModification { tline: self, + pending_lsns: Vec::new(), pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -159,11 +160,11 @@ impl Timeline { //------------------------------------------------------------------------------ /// Look up given page version. 
- pub async fn get_rel_page_at_lsn( + pub(crate) async fn get_rel_page_at_lsn( &self, tag: RelTag, blknum: BlockNumber, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -173,44 +174,47 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, lsn, nblocks + tag, + blknum, + version.get_lsn(), + nblocks ); return Ok(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn, ctx).await + version.get(self, key, ctx).await } // Get size of a database in blocks - pub async fn get_db_size( + pub(crate) async fn get_db_size( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; + let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) } /// Get size of a relation file - pub async fn get_rel_size( + pub(crate) async fn get_rel_size( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -220,12 +224,12 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest, ctx).await? + && !self.get_rel_exists(tag, version, latest, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -235,7 +239,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -246,16 +250,16 @@ impl Timeline { // latest=true, then it can not cause cache corruption, because with latest=true // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be // associated with most recent value of LSN. - self.update_cached_rel_size(tag, lsn, nblocks); + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); } Ok(nblocks) } /// Does relation exist? - pub async fn get_rel_exists( + pub(crate) async fn get_rel_exists( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, _latest: bool, ctx: &RequestContext, ) -> Result { @@ -266,12 +270,12 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -287,16 +291,16 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn list_rels( + pub(crate) async fn list_rels( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -315,7 +319,7 @@ impl Timeline { } /// Look up given SLRU page version. - pub async fn get_slru_page_at_lsn( + pub(crate) async fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -328,29 +332,29 @@ impl Timeline { } /// Get size of an SLRU segment - pub async fn get_slru_segment_size( + pub(crate) async fn get_slru_segment_size( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub async fn get_slru_segment_exists( + pub(crate) async fn get_slru_segment_exists( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -368,7 +372,7 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub async fn find_lsn_for_timestamp( + pub(crate) async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, cancel: &CancellationToken, @@ -448,7 +452,7 @@ impl Timeline { /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits /// with a smaller/larger timestamp. /// - pub async fn is_latest_commit_timestamp_ge_than( + pub(crate) async fn is_latest_commit_timestamp_ge_than( &self, search_timestamp: TimestampTz, probe_lsn: Lsn, @@ -471,7 +475,7 @@ impl Timeline { /// Obtain the possible timestamp range for the given lsn. /// /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps. - pub async fn get_timestamp_for_lsn( + pub(crate) async fn get_timestamp_for_lsn( &self, probe_lsn: Lsn, ctx: &RequestContext, @@ -501,11 +505,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) .await? 
{ let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self @@ -528,36 +532,36 @@ impl Timeline { } /// Get a list of SLRU segments - pub async fn list_slru_segments( + pub(crate) async fn list_slru_segments( &self, kind: SlruKind, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), } } - pub async fn get_relmap_file( + pub(crate) async fn get_relmap_file( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; Ok(buf) } - pub async fn list_dbdirs( + pub(crate) async fn list_dbdirs( &self, lsn: Lsn, ctx: &RequestContext, @@ -571,7 +575,7 @@ impl Timeline { } } - pub async fn get_twophase_file( + pub(crate) async fn get_twophase_file( &self, xid: TransactionId, lsn: Lsn, @@ -582,7 +586,7 @@ impl Timeline { Ok(buf) } - pub async fn list_twophase_files( + pub(crate) async fn list_twophase_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -596,7 +600,7 @@ impl Timeline { } } - pub async fn get_control_file( + pub(crate) async fn get_control_file( &self, lsn: Lsn, ctx: &RequestContext, @@ -604,7 +608,7 @@ impl Timeline { self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint( + pub(crate) async fn get_checkpoint( &self, lsn: Lsn, ctx: &RequestContext, @@ -612,7 +616,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub async fn list_aux_files( + pub(crate) async fn list_aux_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -652,7 +656,10 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { + for rel in self + .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .await? + { if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } @@ -692,7 +699,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn, ctx) + .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) .await? .into_iter() .collect(); @@ -799,18 +806,39 @@ pub struct DatadirModification<'a> { /// in the state in 'tline' yet. pub tline: &'a Timeline, - /// Lsn assigned by begin_modification - pub lsn: Lsn, + /// Current LSN of the modification + lsn: Lsn, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. 
- pending_updates: HashMap, - pending_deletions: Vec>, + pending_lsns: Vec, + pending_updates: HashMap>, + pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, } impl<'a> DatadirModification<'a> { + /// Get the current lsn + pub(crate) fn get_lsn(&self) -> Lsn { + self.lsn + } + + /// Set the current lsn + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { + ensure!( + lsn >= self.lsn, + "setting an older lsn {} than {} is not allowed", + lsn, + self.lsn + ); + if lsn > self.lsn { + self.pending_lsns.push(self.lsn); + self.lsn = lsn; + } + Ok(()) + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> { dbnode: Oid, ctx: &RequestContext, ) -> anyhow::Result<()> { - let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) .await?; // Remove entry from dbdir @@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); - let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { + if self + .tline + .get_rel_exists(rel, Version::Modified(self), true, ctx) + .await? + { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::new(); - for (key, value) in self.pending_updates.drain() { - if is_rel_block_key(&key) || is_slru_block_key(key) { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - writer.put(key, self.lsn, &value, ctx).await?; - } else { - retained_pending_updates.insert(key, value); + let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); + for (key, values) in self.pending_updates.drain() { + for (lsn, value) in values { + if is_rel_block_key(&key) || is_slru_block_key(key) { + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put(key, lsn, &value, ctx).await?; + } else { + retained_pending_updates + .entry(key) + .or_default() + .push((lsn, value)); + } } } - self.pending_updates.extend(retained_pending_updates); + + self.pending_updates = retained_pending_updates; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> { /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let writer = self.tline.writer().await; - let lsn = self.lsn; + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value, ctx).await?; - } - for key_range in self.pending_deletions.drain(..) 
{ - writer.delete(key_range, lsn).await?; + if !self.pending_updates.is_empty() { + writer.put_batch(&self.pending_updates, ctx).await?; + self.pending_updates.clear(); } - writer.finish_write(lsn); + if !self.pending_deletions.is_empty() { + writer.delete_batch(&self.pending_deletions).await?; + self.pending_deletions.clear(); + } + + self.pending_lsns.push(self.lsn); + for pending_lsn in self.pending_lsns.drain(..) { + // Ideally, we should be able to call writer.finish_write() only once + // with the highest LSN. However, the last_record_lsn variable in the + // timeline keeps track of the latest LSN and the immediate previous LSN + // so we need to record every LSN to not leave a gap between them. + writer.finish_write(pending_lsn); + } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub(crate) fn is_empty(&self) -> bool { - self.pending_updates.is_empty() && self.pending_deletions.is_empty() + pub(crate) fn len(&self) -> usize { + self.pending_updates.len() + self.pending_deletions.len() } // Internal helper functions to batch the modifications async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the pending updated + // Have we already updated the same key? Read the latest pending updated // version in that case. // // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. - if let Some(value) = self.pending_updates.get(&key) { - if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( - "unexpected pending WAL record" - ))) + if let Some(values) = self.pending_updates.get(&key) { + if let Some((_, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. 
+ Err(PageReconstructError::from(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; } - } else { - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn, ctx).await } + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn, ctx).await } fn put(&mut self, key: Key, val: Value) { - self.pending_updates.insert(key, val); + let values = self.pending_updates.entry(key).or_default(); + // Replace the previous value if it exists at the same lsn + if let Some((last_lsn, last_value)) = values.last_mut() { + if *last_lsn == self.lsn { + *last_value = val; + return; + } + } + values.push((self.lsn, val)); } fn delete(&mut self, key_range: Range) { trace!("DELETE {}-{}", key_range.start, key_range.end); - self.pending_deletions.push(key_range); + self.pending_deletions.push((key_range, self.lsn)); + } +} + +/// This struct facilitates accessing either a committed key from the timeline at a +/// specific LSN, or the latest uncommitted key from a pending modification. +/// During WAL ingestion, the records from multiple LSNs may be batched in the same +/// modification before being flushed to the timeline. Hence, the routines in WalIngest +/// need to look up the keys in the modification first before looking them up in the +/// timeline to not miss the latest updates. +#[derive(Clone, Copy)] +pub enum Version<'a> { + Lsn(Lsn), + Modified(&'a DatadirModification<'a>), +} + +impl<'a> Version<'a> { + async fn get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result { + match self { + Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::Modified(modification) => modification.get(key, ctx).await, + } + } + + fn get_lsn(&self) -> Lsn { + match self { + Version::Lsn(lsn) => *lsn, + Version::Modified(modification) => modification.lsn, + } } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cb1b2b8011..5a06a97525 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| // else, but that has not been needed in a long time. std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(1, num_cpus::get())) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) }); #[derive(Debug, Clone, Copy)] @@ -258,6 +258,9 @@ pub enum TaskKind { /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, + /// See [`crate::tenant::secondary`]. + SecondaryDownloads, + /// See [`crate::tenant::secondary`]. 
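The Version enum and the reworked DatadirModification above are the core of batched ingest: a single modification can now hold updates for several LSNs (pending_lsns plus a per-key vector of (Lsn, Value)), and reads issued while a batch is still open go through Version::Modified so they observe the uncommitted values, while ordinary readers keep using Version::Lsn. A rough usage sketch assuming the crate-internal APIs shown above; it is illustrative only, not code from the patch:

    async fn batched_example(
        timeline: &Timeline,
        rel: RelTag,
        start_lsn: Lsn,
        next_lsn: Lsn,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut modification = timeline.begin_modification(start_lsn);
        // ... apply WAL records at start_lsn via WalIngest (elided) ...
        modification.set_lsn(next_lsn)?; // only moves forward; the old LSN is queued in pending_lsns
        // ... apply records at next_lsn (elided) ...

        // Reads that must see the pending, uncommitted batch go through the modification.
        let _pending_size = timeline
            .get_rel_size(rel, Version::Modified(&modification), true, ctx)
            .await?;

        // commit() flushes the batch: put_batch/delete_batch on the writer, then
        // finish_write() once per pending LSN so last_record_lsn advances without gaps.
        modification.commit(ctx).await?;

        // After commit, readers address a concrete LSN as before.
        let _committed_size = timeline
            .get_rel_size(rel, Version::Lsn(next_lsn), false, ctx)
            .await?;
        Ok(())
    }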
SecondaryUploads, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e50987c84b..7c609452e5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -56,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; +use self::timeline::WaitLsnError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -595,10 +596,9 @@ impl Tenant { mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); let TenantSharedResources { @@ -1145,10 +1145,9 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -1760,7 +1759,15 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn, ctx).await?; + ancestor_timeline + .wait_lsn(*lsn, ctx) + .await + .map_err(|e| match e { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) + } + WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, + })?; } self.branch_timeline( diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 25d97f51ce..2d4cd350d7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -46,6 +46,8 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b21bad51ba..2f606ed822 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -588,7 +588,7 @@ impl DeleteTenantFlow { } break; } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => { + TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { // This is unexpected: this secondary tenants should not have been created, and we // are not in a position to shut it down from here. 
tracing::warn!("Tenant transitioned to secondary mode while deleting!"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 62922e8c99..70b41b7b1f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -44,6 +44,7 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::secondary::SecondaryTenant; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -57,7 +58,7 @@ use super::TenantSharedResources; /// having a properly acquired generation (Secondary doesn't need a generation) pub(crate) enum TenantSlot { Attached(Arc), - Secondary, + Secondary(Arc), /// In this state, other administrative operations acting on the TenantId should /// block, or return a retry indicator equivalent to HTTP 503. InProgress(utils::completion::Barrier), @@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()), - Self::Secondary => write!(f, "Secondary"), + Self::Secondary(_) => write!(f, "Secondary"), Self::InProgress(_) => write!(f, "InProgress"), } } @@ -78,7 +79,7 @@ impl TenantSlot { fn get_attached(&self) -> Option<&Arc> { match self { Self::Attached(t) => Some(t), - Self::Secondary => None, + Self::Secondary(_) => None, Self::InProgress(_) => None, } } @@ -130,7 +131,7 @@ impl TenantsMap { /// A page service client sends a TenantId, and to look up the correct Tenant we must /// resolve this to a fully qualified TenantShardId. - fn resolve_shard( + fn resolve_attached_shard( &self, tenant_id: &TenantId, selector: ShardSelector, @@ -140,25 +141,27 @@ impl TenantsMap { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + _ => continue, + }; + match selector { ShardSelector::First => return Some(*slot.0), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return Some(*slot.0) } ShardSelector::Page(key) => { - if let Some(tenant) = slot.1.get_attached() { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. - if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. 
+ if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } else { - continue; + if Some(tenant.shard_identity.number) == want_shard { + return Some(*slot.0); } } _ => continue, @@ -464,12 +467,18 @@ pub async fn init_tenant_mgr( *gen } else { match &location_conf.mode { - LocationMode::Secondary(_) => { + LocationMode::Secondary(secondary_config) => { // We do not require the control plane's permission for secondary mode // tenants, because they do no remote writes and hence require no // generation number info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert(tenant_shard_id, TenantSlot::Secondary); + tenants.insert( + tenant_shard_id, + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + secondary_config, + )), + ); } LocationMode::Attached(_) => { // TODO: augment re-attach API to enable the control plane to @@ -661,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { total_attached += 1; } - TenantSlot::Secondary => { - shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary); + TenantSlot::Secondary(state) => { + // We don't need to wait for this individually per-tenant: the + // downloader task will be waited on eventually, this cancel + // is just to encourage it to drop out if it is doing work + // for this tenant right now. + state.cancel.cancel(); + + shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state)); } TenantSlot::InProgress(notify) => { // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will @@ -845,12 +860,28 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } } + pub(crate) fn get_secondary_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + ) -> Option> { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + match peek_slot { + Some(TenantSlot::Secondary(s)) => Some(s.clone()), + _ => None, + } + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -862,10 +893,15 @@ impl TenantManager { debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); - // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, + enum FastPathModified { + Attached(Arc), + Secondary(Arc), + } + + // Special case fast-path for updates to existing slots: if our upsert is only updating configuration, // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. 
- let modify_tenant = { + let fast_path_taken = { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; @@ -879,12 +915,19 @@ impl TenantManager { new_location_config.clone(), )?); - Some(tenant.clone()) + Some(FastPathModified::Attached(tenant.clone())) } else { // Different generations, fall through to general case None } } + ( + LocationMode::Secondary(secondary_conf), + Some(TenantSlot::Secondary(secondary_tenant)), + ) => { + secondary_tenant.set_config(secondary_conf); + Some(FastPathModified::Secondary(secondary_tenant.clone())) + } _ => { // Not an Attached->Attached transition, fall through to general case None @@ -893,34 +936,51 @@ impl TenantManager { }; // Fast-path continued: having dropped out of the self.tenants lock, do the async - // phase of waiting for flush, before returning. - if let Some(tenant) = modify_tenant { - // Transition to AttachedStale means we may well hold a valid generation - // still, and have been requested to go stale as part of a migration. If - // the caller set `flush`, then flush to remote storage. - if let LocationMode::Attached(AttachedLocationConfig { - generation: _, - attach_mode: AttachmentMode::Stale, - }) = &new_location_config.mode - { - if let Some(flush_timeout) = flush { - match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { - Ok(Err(e)) => { - return Err(e); - } - Ok(Ok(_)) => return Ok(()), - Err(_) => { - tracing::warn!( + // phase of writing config and/or waiting for flush, before returning. + match fast_path_taken { + Some(FastPathModified::Attached(tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + // Transition to AttachedStale means we may well hold a valid generation + // still, and have been requested to go stale as part of a migration. If + // the caller set `flush`, then flush to remote storage. + if let LocationMode::Attached(AttachedLocationConfig { + generation: _, + attach_mode: AttachmentMode::Stale, + }) = &new_location_config.mode + { + if let Some(flush_timeout) = flush { + match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { + Ok(Err(e)) => { + return Err(e); + } + Ok(Ok(_)) => return Ok(()), + Err(_) => { + tracing::warn!( timeout_ms = flush_timeout.as_millis(), "Timed out waiting for flush to remote storage, proceeding anyway." ) + } } } } - } - return Ok(()); - } + return Ok(()); + } + Some(FastPathModified::Secondary(_secondary_tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + return Ok(()); + } + None => { + // Proceed with the general case procedure, where we will shutdown & remove any existing + // slot contents and replace with a fresh one + } + }; // General case for upserts to TenantsMap, excluding the case above: we will substitute an // InProgress value to the slot while we make whatever changes are required. The state for @@ -929,33 +989,47 @@ impl TenantManager { // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { - // The case where we keep a Tenant alive was covered above in the special case - // for Attached->Attached transitions in the same generation. 
By this point, - // if we see an attached tenant we know it will be discarded and should be - // shut down. - let (_guard, progress) = utils::completion::channel(); + match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { + // The case where we keep a Tenant alive was covered above in the special case + // for Attached->Attached transitions in the same generation. By this point, + // if we see an attached tenant we know it will be discarded and should be + // shut down. + let (_guard, progress) = utils::completion::channel(); - match tenant.get_attach_mode() { - AttachmentMode::Single | AttachmentMode::Multi => { - // Before we leave our state as the presumed holder of the latest generation, - // flush any outstanding deletions to reduce the risk of leaking objects. - self.resources.deletion_queue_client.flush_advisory() - } - AttachmentMode::Stale => { - // If we're stale there's not point trying to flush deletions - } - }; + match tenant.get_attach_mode() { + AttachmentMode::Single | AttachmentMode::Multi => { + // Before we leave our state as the presumed holder of the latest generation, + // flush any outstanding deletions to reduce the risk of leaking objects. + self.resources.deletion_queue_client.flush_advisory() + } + AttachmentMode::Stale => { + // If we're stale there's not point trying to flush deletions + } + }; - info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { - Ok(()) => {} - Err(barrier) => { - info!("Shutdown already in progress, waiting for it to complete"); - barrier.wait().await; + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } } + slot_guard.drop_old_value().expect("We just shut it down"); + } + Some(TenantSlot::Secondary(state)) => { + info!("Shutting down secondary tenant"); + state.shutdown().await; + } + Some(TenantSlot::InProgress(_)) => { + // This should never happen: acquire_slot should error out + // if the contents of a slot were InProgress. + anyhow::bail!("Acquired an InProgress slot, this is a bug.") + } + None => { + // Slot was vacant, nothing needs shutting down. } - slot_guard.drop_old_value().expect("We just shut it down"); } let tenant_path = self.conf.tenant_path(&tenant_shard_id); @@ -978,7 +1052,9 @@ impl TenantManager { .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => TenantSlot::Secondary, + LocationMode::Secondary(secondary_config) => { + TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config)) + } LocationMode::Attached(_attach_config) => { let shard_identity = new_location_config.shard; let tenant = tenant_spawn( @@ -1091,6 +1167,30 @@ impl TenantManager { .collect(), } } + // Do some synchronous work for all tenant slots in Secondary state. The provided + // callback should be small and fast, as it will be called inside the global + // TenantsMap lock. 
+ pub(crate) fn foreach_secondary_tenants(&self, mut func: F) + where + // TODO: let the callback return a hint to drop out of the loop early + F: FnMut(&TenantShardId, &Arc), + { + let locked = self.tenants.read().unwrap(); + + let map = match &*locked { + TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return, + TenantsMap::Open(m) => m, + }; + + for (tenant_id, slot) in map { + if let TenantSlot::Secondary(state) = slot { + // Only expose secondary tenants that are not currently shutting down + if !state.cancel.is_cancelled() { + func(tenant_id, state) + } + } + } + } pub(crate) async fn delete_tenant( &self, @@ -1205,7 +1305,7 @@ pub(crate) fn get_tenant( Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } @@ -1257,9 +1357,11 @@ pub(crate) async fn get_active_tenant_with_timeout( let locked = TENANTS.read().unwrap(); // Resolve TenantId to TenantShardId - let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or( - GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)), - )?; + let tenant_shard_id = locked + .resolve_attached_shard(&tenant_id, shard_selector) + .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + )))?; let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .map_err(GetTenantError::MapState)?; @@ -1276,7 +1378,7 @@ pub(crate) async fn get_active_tenant_with_timeout( } } } - Some(TenantSlot::Secondary) => { + Some(TenantSlot::Secondary(_)) => { return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( tenant_id, ))) @@ -1540,7 +1642,7 @@ pub(crate) async fn list_tenants() -> Result, Ok(m.iter() .filter_map(|(id, tenant)| match tenant { TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), - TenantSlot::Secondary => None, + TenantSlot::Secondary(_) => None, TenantSlot::InProgress(_) => None, }) .collect()) @@ -1797,11 +1899,7 @@ impl SlotGuard { fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), - Some(TenantSlot::Secondary) => { - // TODO: when adding secondary mode tenants, this will check for shutdown - // in the same way that we do for `Tenant` above - true - } + Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(), Some(TenantSlot::InProgress(_)) => { // A SlotGuard cannot be constructed for a slot that was already InProgress unreachable!() @@ -2011,26 +2109,19 @@ where let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; - // The SlotGuard allows us to manipulate the Tenant object without fear of some - // concurrent API request doing something else for the same tenant ID. - let attached_tenant = match slot_guard.get_old_value() { - Some(TenantSlot::Attached(t)) => Some(t), - _ => None, - }; - // allow pageserver shutdown to await for our completion let (_guard, progress) = completion::channel(); - // If the tenant was attached, shut it down gracefully. For secondary - // locations this part is not necessary - match &attached_tenant { - Some(attached_tenant) => { + // The SlotGuard allows us to manipulate the Tenant object without fear of some + // concurrent API request doing something else for the same tenant ID. 
+ let attached_tenant = match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload let freeze_and_flush = false; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match attached_tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, freeze_and_flush).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to @@ -2039,11 +2130,19 @@ where return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id)); } } + Some(tenant) } - None => { - // Nothing to wait on when not attached, proceed. + Some(TenantSlot::Secondary(secondary_state)) => { + tracing::info!("Shutting down in secondary mode"); + secondary_state.shutdown().await; + None } - } + Some(TenantSlot::InProgress(_)) => { + // Acquiring a slot guarantees its old value was not InProgress + unreachable!(); + } + None => None, + }; match tenant_cleanup .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1b0cf39fbe..2ea3ced008 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -229,6 +229,7 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, + TENANT_HEATMAP_BASENAME, }; use utils::id::{TenantId, TimelineId}; @@ -818,8 +819,25 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, ) { + // Filter out any layers which were not created by this tenant shard. These are + // layers that originate from some ancestor shard after a split, and may still + // be referenced by other shards. We are free to delete them locally and remove + // them from our index (and would have already done so when we reach this point + // in the code), but we may not delete them remotely. 
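The comment above captures the key invariant for shard splits: a child shard may hold layer files inherited from its ancestor, and sibling shards may still reference the same remote objects, so only layers stamped with this shard's own number and count are eligible for remote deletion. A minimal sketch of that retain filter over simplified types (ShardIndex and LayerMeta here are illustrative, not the real index types):

    #[derive(PartialEq, Eq, Clone, Copy)]
    struct ShardIndex {
        number: u8,
        count: u8,
    }

    struct LayerMeta {
        shard: ShardIndex,
    }

    /// Keep only layers created by `this_shard`: ancestor-shard layers are dropped from the
    /// remote-deletion list because sibling shards may still reference them.
    fn retain_own_layers(this_shard: ShardIndex, layers: &mut Vec<(String, LayerMeta)>) {
        layers.retain(|(name, meta)| {
            let own = meta.shard == this_shard;
            if !own {
                eprintln!("skipping remote deletion of ancestor-shard layer {name}");
            }
            own
        });
    }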
+ with_metadata.retain(|(name, meta)| { + let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count; + if !retain { + tracing::debug!( + "Skipping deletion of ancestor-shard layer {name}, from shard {}", + meta.shard + ); + } + retain + }); + for (name, meta) in &with_metadata { info!( "scheduling deletion of layer {}{} (shard {})", @@ -1724,11 +1742,11 @@ pub fn remote_index_path( .expect("Failed to construct path") } -pub const HEATMAP_BASENAME: &str = "heatmap-v1.json"; - pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { - RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}")) - .expect("Failed to construct path") + RemotePath::from_string(&format!( + "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}" + )) + .expect("Failed to construct path") } /// Given the key of an index, parse out the generation part of the name diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index d25fe56b92..2331447266 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -1,24 +1,48 @@ +mod downloader; pub mod heatmap; mod heatmap_uploader; +mod scheduler; use std::sync::Arc; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use self::heatmap_uploader::heatmap_uploader_task; +use self::{ + downloader::{downloader_task, SecondaryDetail}, + heatmap_uploader::heatmap_uploader_task, +}; -use super::mgr::TenantManager; +use super::{config::SecondaryLocationConfig, mgr::TenantManager}; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; -use utils::completion::Barrier; +use utils::{completion::Barrier, sync::gate::Gate}; +enum DownloadCommand { + Download(TenantShardId), +} enum UploadCommand { Upload(TenantShardId), } +impl UploadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Upload(id) => id, + } + } +} + +impl DownloadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Download(id) => id, + } + } +} + struct CommandRequest { payload: T, response_tx: tokio::sync::oneshot::Sender, @@ -28,12 +52,73 @@ struct CommandResponse { result: anyhow::Result<()>, } +// Whereas [`Tenant`] represents an attached tenant, this type represents the work +// we do for secondary tenant locations: where we are not serving clients or +// ingesting WAL, but we are maintaining a warm cache of layer files. +// +// This type is all about the _download_ path for secondary mode. The upload path +// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists. +// +// This structure coordinates TenantManager and SecondaryDownloader, +// so that the downloader can indicate which tenants it is currently +// operating on, and the manager can indicate when a particular +// secondary tenant should cancel any work in flight. +#[derive(Debug)] +pub(crate) struct SecondaryTenant { + /// Carrying a tenant shard ID simplifies callers such as the downloader + /// which need to organize many of these objects by ID. + tenant_shard_id: TenantShardId, + + /// Cancellation token indicates to SecondaryDownloader that it should stop doing + /// any work for this tenant at the next opportunity. 
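SecondaryTenant pairs a CancellationToken (ask in-flight downloader work to stop) with a Gate (wait until that work has actually drained) so that shutdown is both prompt and orderly. Gate is a Neon-internal utility; the sketch below illustrates the same cancel-then-wait handshake using only tokio primitives, with a semaphore standing in for the gate, and is not the pageserver's implementation:

    use std::sync::Arc;
    use tokio::sync::Semaphore;
    use tokio_util::sync::CancellationToken;

    const MAX_WORKERS: u32 = 64;

    struct ShutdownCoordinator {
        cancel: CancellationToken,
        // Stand-in for the internal Gate: each unit of work holds a permit, and shutdown
        // acquires every permit to know that all work has drained.
        gate: Arc<Semaphore>,
    }

    impl ShutdownCoordinator {
        fn new() -> Self {
            Self {
                cancel: CancellationToken::new(),
                gate: Arc::new(Semaphore::new(MAX_WORKERS as usize)),
            }
        }

        async fn do_work(&self) {
            let _permit = self.gate.acquire().await.expect("semaphore is never closed");
            tokio::select! {
                _ = self.cancel.cancelled() => { /* drop out at the next opportunity */ }
                _ = tokio::time::sleep(std::time::Duration::from_millis(10)) => { /* ...real work... */ }
            }
        }

        async fn shutdown(&self) {
            // Ask workers to stop...
            self.cancel.cancel();
            // ...then wait until none of them still hold a permit.
            let _all = self
                .gate
                .acquire_many(MAX_WORKERS)
                .await
                .expect("semaphore is never closed");
        }
    }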
+ pub(crate) cancel: CancellationToken, + + pub(crate) gate: Gate, + + detail: std::sync::Mutex, +} + +impl SecondaryTenant { + pub(crate) fn new( + tenant_shard_id: TenantShardId, + config: &SecondaryLocationConfig, + ) -> Arc { + Arc::new(Self { + tenant_shard_id, + // todo: shall we make this a descendent of the + // main cancellation token, or is it sufficient that + // on shutdown we walk the tenants and fire their + // individual cancellations? + cancel: CancellationToken::new(), + gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + + detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + }) + } + + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + // Wait for any secondary downloader work to complete + self.gate.close().await; + } + + pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { + self.detail.lock().unwrap().config = config.clone(); + } + + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, /// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, /// where we want to immediately upload/download for a particular tenant. In normal operation /// uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, + download_req_tx: tokio::sync::mpsc::Sender>, } impl SecondaryController { @@ -63,6 +148,13 @@ impl SecondaryController { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } + pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + self.dispatch( + &self.download_req_tx, + DownloadCommand::Download(tenant_shard_id), + ) + .await + } } pub fn spawn_tasks( @@ -71,9 +163,37 @@ pub fn spawn_tasks( background_jobs_can_start: Barrier, cancel: CancellationToken, ) -> SecondaryController { + let mgr_clone = tenant_manager.clone(); + let storage_clone = remote_storage.clone(); + let cancel_clone = cancel.clone(); + let bg_jobs_clone = background_jobs_can_start.clone(); + + let (download_req_tx, download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::SecondaryDownloads, + None, + None, + "secondary tenant downloads", + false, + async move { + downloader_task( + mgr_clone, + storage_clone, + download_req_rx, + bg_jobs_clone, + cancel_clone, + ) + .await; + + Ok(()) + }, + ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryUploads, @@ -89,16 +209,26 @@ pub fn spawn_tasks( background_jobs_can_start, cancel, ) - .await + .await; + + Ok(()) }, ); - SecondaryController { upload_req_tx } + SecondaryController { + download_req_tx, + upload_req_tx, + } } /// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
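SecondaryController is a pseudo-rpc client: each command travels over an mpsc channel as a CommandRequest carrying a oneshot sender, and the background task replies on that sender when the work is done. A generic sketch of that request/response plumbing, with a simplified Request type standing in for CommandRequest/CommandResponse:

    use tokio::sync::{mpsc, oneshot};

    struct Request<T> {
        payload: T,
        response_tx: oneshot::Sender<anyhow::Result<()>>,
    }

    /// Caller side: send a command and wait for the background task to report the result.
    async fn dispatch<T>(queue: &mpsc::Sender<Request<T>>, payload: T) -> anyhow::Result<()> {
        let (response_tx, response_rx) = oneshot::channel();
        queue
            .send(Request { payload, response_tx })
            .await
            .map_err(|_| anyhow::anyhow!("background task is shut down"))?;
        response_rx
            .await
            .map_err(|_| anyhow::anyhow!("background task dropped the request"))?
    }

    /// Task side: receive commands, do the work, and reply on the oneshot channel.
    async fn serve<T>(
        mut rx: mpsc::Receiver<Request<T>>,
        mut handle: impl FnMut(T) -> anyhow::Result<()>,
    ) {
        while let Some(Request { payload, response_tx }) = rx.recv().await {
            // Ignore send errors: the caller may have stopped waiting.
            let _ = response_tx.send(handle(payload));
        }
    }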
pub fn null_controller() -> SecondaryController { + let (download_req_tx, _download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, _upload_req_rx) = tokio::sync::mpsc::channel::>(16); - SecondaryController { upload_req_tx } + SecondaryController { + upload_req_tx, + download_req_tx, + } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs new file mode 100644 index 0000000000..6fdee08a4e --- /dev/null +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -0,0 +1,801 @@ +use std::{ + collections::{HashMap, HashSet}, + pin::Pin, + str::FromStr, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use crate::{ + config::PageServerConf, + metrics::SECONDARY_MODE, + tenant::{ + config::SecondaryLocationConfig, + debug_assert_current_span_has_tenant_and_timeline_id, + remote_timeline_client::{ + index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + }, + span::debug_assert_current_span_has_tenant_id, + storage_layer::LayerFileName, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + }, + virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, + METADATA_FILE_NAME, TEMP_FILE_SUFFIX, +}; + +use super::{ + heatmap::HeatMapLayer, + scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + SecondaryTenant, +}; + +use crate::tenant::{ + mgr::TenantManager, + remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, +}; + +use chrono::format::{DelayedFormat, StrftimeItems}; +use futures::Future; +use pageserver_api::shard::TenantShardId; +use rand::Rng; +use remote_storage::{DownloadError, GenericRemoteStorage}; + +use tokio_util::sync::CancellationToken; +use tracing::{info_span, instrument, Instrument}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, +}; + +use super::{ + heatmap::{HeatMapTenant, HeatMapTimeline}, + CommandRequest, DownloadCommand, +}; + +/// For each tenant, how long must have passed since the last download_tenant call before +/// calling it again. This is approximately the time by which local data is allowed +/// to fall behind remote data. 
+/// +/// TODO: this should just be a default, and the actual period should be controlled +/// via the heatmap itself +/// `` +const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); + +pub(super) async fn downloader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + + let generator = SecondaryDownloader { + tenant_manager, + remote_storage, + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("secondary_downloads")) + .await +} + +struct SecondaryDownloader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, +} + +#[derive(Debug, Clone)] +pub(super) struct OnDiskState { + metadata: LayerFileMetadata, + access_time: SystemTime, +} + +impl OnDiskState { + fn new( + _conf: &'static PageServerConf, + _tenant_shard_id: &TenantShardId, + _imeline_id: &TimelineId, + _ame: LayerFileName, + metadata: LayerFileMetadata, + access_time: SystemTime, + ) -> Self { + Self { + metadata, + access_time, + } + } +} + +#[derive(Debug, Clone, Default)] +pub(super) struct SecondaryDetailTimeline { + pub(super) on_disk_layers: HashMap, + + /// We remember when layers were evicted, to prevent re-downloading them. + pub(super) evicted_at: HashMap, +} + +/// This state is written by the secondary downloader, it is opaque +/// to TenantManager +#[derive(Debug)] +pub(super) struct SecondaryDetail { + pub(super) config: SecondaryLocationConfig, + + last_download: Option, + next_download: Option, + pub(super) timelines: HashMap, +} + +/// Helper for logging SystemTime +fn strftime(t: &'_ SystemTime) -> DelayedFormat> { + let datetime: chrono::DateTime = (*t).into(); + datetime.format("%d/%m/%Y %T") +} + +impl SecondaryDetail { + pub(super) fn new(config: SecondaryLocationConfig) -> Self { + Self { + config, + last_download: None, + next_download: None, + timelines: HashMap::new(), + } + } +} + +struct PendingDownload { + secondary_state: Arc, + last_download: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for PendingDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +struct RunningDownload { + barrier: Barrier, +} + +impl scheduler::RunningJob for RunningDownload { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + +struct CompleteDownload { + secondary_state: Arc, + completed_at: Instant, +} + +impl scheduler::Completion for CompleteDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +type Scheduler = TenantBackgroundJobs< + SecondaryDownloader, + PendingDownload, + RunningDownload, + CompleteDownload, + DownloadCommand, +>; + +#[async_trait::async_trait] +impl JobGenerator + for SecondaryDownloader +{ + #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))] + fn on_completion(&mut self, completion: CompleteDownload) { + let CompleteDownload { + secondary_state, + completed_at: _completed_at, + } = completion; + + tracing::debug!("Secondary tenant download completed"); + + // Update freshened_at even if there was an error: we don't want errored tenants to implicitly + // take priority to run again. 
+ let mut detail = secondary_state.detail.lock().unwrap(); + detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL); + } + + async fn schedule(&mut self) -> SchedulingResult { + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; + + // Step 1: identify some tenants that we may work on + let mut tenants: Vec> = Vec::new(); + self.tenant_manager + .foreach_secondary_tenants(|_id, secondary_state| { + tenants.push(secondary_state.clone()); + }); + + // Step 2: filter out tenants which are not yet elegible to run + let now = Instant::now(); + result.jobs = tenants + .into_iter() + .filter_map(|secondary_tenant| { + let (last_download, next_download) = { + let mut detail = secondary_tenant.detail.lock().unwrap(); + + if !detail.config.warm { + // Downloads are disabled for this tenant + detail.next_download = None; + return None; + } + + if detail.next_download.is_none() { + // Initialize with a jitter: this spreads initial downloads on startup + // or mass-attach across our freshen interval. + let jittered_period = + rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL); + detail.next_download = Some(now.checked_add(jittered_period).expect( + "Using our constant, which is known to be small compared with clock range", + )); + } + (detail.last_download, detail.next_download.unwrap()) + }; + + if now < next_download { + Some(PendingDownload { + secondary_state: secondary_tenant, + last_download, + target_time: Some(next_download), + period: Some(DOWNLOAD_FRESHEN_INTERVAL), + }) + } else { + None + } + }) + .collect(); + + // Step 3: sort by target execution time to run most urgent first. + result.jobs.sort_by_key(|j| j.target_time); + + result + } + + fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + let tenant = self + .tenant_manager + .get_secondary_tenant_shard(*tenant_shard_id); + let Some(tenant) = tenant else { + { + return Err(anyhow::anyhow!("Not found or not in Secondary mode")); + } + }; + + Ok(PendingDownload { + target_time: None, + period: None, + last_download: None, + secondary_state: tenant, + }) + } + + fn spawn( + &mut self, + job: PendingDownload, + ) -> ( + RunningDownload, + Pin + Send>>, + ) { + let PendingDownload { + secondary_state, + last_download, + target_time, + period, + } = job; + + let (completion, barrier) = utils::completion::channel(); + let remote_storage = self.remote_storage.clone(); + let conf = self.tenant_manager.get_conf(); + let tenant_shard_id = *secondary_state.get_tenant_shard_id(); + (RunningDownload { barrier }, Box::pin(async move { + let _completion = completion; + + match TenantDownloader::new(conf, &remote_storage, &secondary_state) + .download() + .await + { + Err(UpdateError::NoData) => { + tracing::info!("No heatmap found for tenant. This is fine if it is new."); + }, + Err(UpdateError::NoSpace) => { + tracing::warn!("Insufficient space while downloading. Will retry later."); + } + Err(UpdateError::Cancelled) => { + tracing::debug!("Shut down while downloading"); + }, + Err(UpdateError::Deserialize(e)) => { + tracing::error!("Corrupt content while downloading tenant: {e}"); + }, + Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => { + tracing::error!("Error while downloading tenant: {e}"); + }, + Ok(()) => {} + }; + + // Irrespective of the result, we will reschedule ourselves to run after our usual period. 
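The schedule() hunk above seeds next_download with a random offset inside the freshen interval, so a pageserver restart or mass attach does not make every secondary tenant hit remote storage at the same instant. A small sketch of that jittering, assuming only the rand crate and an illustrative 60-second period:

    use rand::Rng;
    use std::time::{Duration, Instant};

    const FRESHEN_INTERVAL: Duration = Duration::from_secs(60); // illustrative period

    /// First run of a periodic job: spread starts uniformly across one period.
    fn jittered_first_run(now: Instant) -> Instant {
        let jitter = rand::thread_rng().gen_range(Duration::ZERO..FRESHEN_INTERVAL);
        now.checked_add(jitter)
            .expect("interval is tiny compared with the clock's range")
    }

    /// Subsequent runs are simply one full period after the previous completion.
    fn next_run(completed_at: Instant) -> Instant {
        completed_at + FRESHEN_INTERVAL
    }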
+ + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. + if let (Some(target_time), Some(period)) = (target_time, period) { + // Only track execution lag if this isn't our first download: otherwise, it is expected + // that execution will have taken longer than our configured interval, for example + // when starting up a pageserver and + if last_download.is_some() { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = Instant::now().duration_since(target_time); + + warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::SecondaryDownload, + ); + } + } + + CompleteDownload { + secondary_state, + completed_at: Instant::now(), + } + }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } +} + +/// This type is a convenience to group together the various functions involved in +/// freshening a secondary tenant. +struct TenantDownloader<'a> { + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, +} + +/// Errors that may be encountered while updating a tenant +#[derive(thiserror::Error, Debug)] +enum UpdateError { + #[error("No remote data found")] + NoData, + #[error("Insufficient local storage space")] + NoSpace, + #[error("Failed to download")] + DownloadError(DownloadError), + #[error(transparent)] + Deserialize(#[from] serde_json::Error), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for UpdateError { + fn from(value: DownloadError) -> Self { + match &value { + DownloadError::Cancelled => Self::Cancelled, + DownloadError::NotFound => Self::NoData, + _ => Self::DownloadError(value), + } + } +} + +impl From for UpdateError { + fn from(value: std::io::Error) -> Self { + if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + UpdateError::NoSpace + } else { + // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue + UpdateError::Other(anyhow::anyhow!(value)) + } + } +} + +impl<'a> TenantDownloader<'a> { + fn new( + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, + ) -> Self { + Self { + conf, + remote_storage, + secondary_state, + } + } + + async fn download(&self) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_id(); + + // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure + // cover our access to local storage. + let Ok(_guard) = self.secondary_state.gate.enter() else { + // Shutting down + return Ok(()); + }; + + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // Download the tenant's heatmap + let heatmap_bytes = tokio::select!( + bytes = self.download_heatmap() => {bytes?}, + _ = self.secondary_state.cancel.cancelled() => return Ok(()) + ); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + + // Save the heatmap: this will be useful on restart, allowing us to reconstruct + // layer metadata without having to re-download it. 
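The downloaded heatmap is persisted with a write-to-temp-then-rename pattern (VirtualFile::crashsafe_overwrite in the hunk below), so a crash mid-write can never leave a torn heatmap file. The following is a simplified illustration of that pattern using plain tokio::fs rather than the pageserver's VirtualFile; the "___temp" suffix is a placeholder:

    use std::path::Path;
    use tokio::io::AsyncWriteExt;

    /// Overwrite `path` without ever exposing a torn file: write a temporary sibling,
    /// flush it, then atomically rename it over the destination.
    async fn crashsafe_overwrite(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
        let tmp = path.with_extension("___temp"); // placeholder suffix
        let mut f = tokio::fs::File::create(&tmp).await?;
        f.write_all(bytes).await?;
        // Sync the data before the rename, so a crash cannot leave a renamed-but-empty file.
        f.sync_all().await?;
        drop(f);
        tokio::fs::rename(&tmp, path).await?;
        // A production implementation would also fsync the parent directory so the rename
        // itself survives a crash.
        Ok(())
    }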
+ let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); + let heatmap_path_bg = heatmap_path.clone(); + tokio::task::spawn_blocking(move || { + tokio::runtime::Handle::current().block_on(async move { + VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await + }) + }) + .await + .expect("Blocking task is never aborted") + .maybe_fatal_err(&context_msg)?; + + tracing::debug!("Wrote local heatmap to {}", heatmap_path); + + // Download the layers in the heatmap + for timeline in heatmap.timelines { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + let timeline_id = timeline.timeline_id; + self.download_timeline(timeline) + .instrument(tracing::info_span!( + "secondary_download_timeline", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id + )) + .await?; + } + + Ok(()) + } + + async fn download_heatmap(&self) -> Result, UpdateError> { + debug_assert_current_span_has_tenant_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // TODO: make download conditional on ETag having changed since last download + // (https://github.com/neondatabase/neon/issues/6199) + tracing::debug!("Downloading heatmap for secondary tenant",); + + let heatmap_path = remote_heatmap_path(tenant_shard_id); + + let heatmap_bytes = backoff::retry( + || async { + let download = self + .remote_storage + .download(&heatmap_path) + .await + .map_err(UpdateError::from)?; + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; + Ok(heatmap_bytes) + }, + |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "download heatmap", + backoff::Cancel::new(self.secondary_state.cancel.clone(), || { + UpdateError::Cancelled + }), + ) + .await?; + + SECONDARY_MODE.download_heatmap.inc(); + + Ok(heatmap_bytes) + } + + async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id); + + // Accumulate updates to the state + let mut touched = Vec::new(); + + // Clone a view of what layers already exist on disk + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. 
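download_heatmap wraps the remote fetch in backoff::retry, a Neon utility that retries transient failures with exponential backoff while honouring cancellation and treating some errors (NoData, Cancelled) as permanent. A generic sketch of that retry shape is below; the base delay, the cap, and returning the last error on cancellation are illustrative choices, not the utility's actual parameters:

    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    /// Retry `op` with exponential backoff until it succeeds, hits a permanent error,
    /// exhausts its retries, or the cancellation token fires.
    async fn retry_with_backoff<T, E, F, Fut>(
        mut op: F,
        is_permanent: impl Fn(&E) -> bool,
        max_retries: u32,
        cancel: &CancellationToken,
    ) -> Result<T, E>
    where
        F: FnMut() -> Fut,
        Fut: std::future::Future<Output = Result<T, E>>,
    {
        let mut attempt = 0;
        loop {
            match op().await {
                Ok(v) => return Ok(v),
                Err(e) if is_permanent(&e) || attempt >= max_retries => return Err(e),
                Err(e) => {
                    // Capped exponential backoff so transient remote-storage hiccups don't spin hot.
                    let delay = Duration::from_millis(100 * 2u64.pow(attempt.min(7)));
                    tokio::select! {
                        _ = tokio::time::sleep(delay) => {}
                        // On shutdown, give up promptly and surface the last error.
                        _ = cancel.cancelled() => return Err(e),
                    }
                    attempt += 1;
                }
            }
        }
    }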
+ let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, &timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + let layers_in_heatmap = timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + let local_path = timeline_path.join(layer.to_string()); + tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + } + + // Download heatmap layers that are not present on local disk, or update their + // access time if they are already present. + for layer in timeline.layers { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) + || on_disk.access_time != layer.access_time + { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + touched.push(layer); + } + continue; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. + if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + continue; + } + } + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + timeline.timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &self.secondary_state.cancel, + ) + .await + { + Ok(bytes) => bytes, + Err(e) => { + if let DownloadError::NotFound = e { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + continue; + } else { + return Err(e.into()); + } + } + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = timeline_path.join(layer.name.to_string()); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. 
Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } + + SECONDARY_MODE.download_layer.inc(); + touched.push(layer) + } + + // Write updates to state to record layers we just downloaded or touched. + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + + tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); + + for t in touched { + use std::collections::hash_map::Entry; + match timeline_detail.on_disk_layers.entry(t.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = t.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + t.name, + LayerFileMetadata::from(&t.metadata), + t.access_time, + )); + } + } + } + } + + Ok(()) + } +} + +/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline +async fn init_timeline_state( + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + heatmap: &HeatMapTimeline, +) -> SecondaryDetailTimeline { + let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); + let mut detail = SecondaryDetailTimeline::default(); + + let mut dir = match tokio::fs::read_dir(&timeline_path).await { + Ok(d) => d, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + let context = format!("Creating timeline directory {timeline_path}"); + tracing::info!("{}", context); + tokio::fs::create_dir_all(&timeline_path) + .await + .fatal_err(&context); + + // No entries to report: drop out. + return detail; + } else { + on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}")); + } + } + }; + + // As we iterate through layers found on disk, we will look up their metadata from this map. + // Layers not present in metadata will be discarded. + let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + + while let Some(dentry) = dir + .next_entry() + .await + .fatal_err(&format!("Listing {timeline_path}")) + { + let dentry_file_name = dentry.file_name(); + let file_name = dentry_file_name.to_string_lossy(); + let local_meta = dentry.metadata().await.fatal_err(&format!( + "Read metadata on {}", + dentry.path().to_string_lossy() + )); + + // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. + if file_name == METADATA_FILE_NAME { + continue; + } + + match LayerFileName::from_str(&file_name) { + Ok(name) => { + let remote_meta = heatmap_metadata.get(&name); + match remote_meta { + Some(remote_meta) => { + // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) + if local_meta.len() != remote_meta.metadata.file_size { + // This should not happen, because we do crashsafe write-then-rename when downloading + // layers, and layers in remote storage are immutable. Remove the local file because + // we cannot trust it. + tracing::warn!( + "Removing local layer {name} with unexpected local size {} != {}", + local_meta.len(), + remote_meta.metadata.file_size + ); + } else { + // We expect the access time to be initialized immediately afterwards, when + // the latest heatmap is applied to the state. 
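download_timeline reconciles local state against the heatmap in two directions: layers on disk but absent from the heatmap are deleted, and heatmap layers missing locally are downloaded. The deletion half is a plain set difference; a minimal sketch with string names standing in for LayerFileName:

    use std::collections::HashSet;

    /// Layers present locally but absent from the latest heatmap: these are the files the
    /// secondary location should delete.
    fn layers_to_remove<'a>(
        on_disk: &'a HashSet<String>,
        in_heatmap: &'a HashSet<String>,
    ) -> Vec<&'a String> {
        on_disk.difference(in_heatmap).collect()
    }

    // Example: on_disk = {A, B, C}, in_heatmap = {B, C, D}  ->  remove {A};
    // D is handled by the download half of the reconciliation instead.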
+ detail.on_disk_layers.insert( + name.clone(), + OnDiskState::new( + conf, + tenant_shard_id, + &heatmap.timeline_id, + name, + LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.access_time, + ), + ); + } + } + None => { + // FIXME: consider some optimization when transitioning from attached to secondary: maybe + // wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise + // we will end up deleting any layers which were created+uploaded more recently than the heatmap. + tracing::info!( + "Removing secondary local layer {} because it's absent in heatmap", + name + ); + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } + } + } + Err(_) => { + // Ignore it. + tracing::warn!("Unexpected file in timeline directory: {file_name}"); + } + } + } + + detail +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index ece2b93ce1..ef01c33e8e 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,5 +1,6 @@ use std::{ collections::HashMap, + pin::Pin, sync::{Arc, Weak}, time::{Duration, Instant}, }; @@ -7,35 +8,86 @@ use std::{ use crate::{ metrics::SECONDARY_MODE, tenant::{ - config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, - secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant, + config::AttachmentMode, + mgr::TenantManager, + remote_timeline_client::remote_heatmap_path, + span::debug_assert_current_span_has_tenant_id, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + Tenant, }, }; +use futures::Future; use md5; use pageserver_api::shard::TenantShardId; +use rand::Rng; use remote_storage::GenericRemoteStorage; -use tokio::task::JoinSet; +use super::{ + scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + CommandRequest, +}; use tokio_util::sync::CancellationToken; -use tracing::instrument; -use utils::{backoff, completion::Barrier}; +use tracing::{info_span, instrument, Instrument}; +use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand}; +use super::{heatmap::HeatMapTenant, UploadCommand}; -/// Period between heatmap uploader walking Tenants to look for work to do. -/// If any tenants have a heatmap upload period lower than this, it will be adjusted -/// downward to match. 
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000); -const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000); +pub(super) async fn heatmap_uploader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency; + + let generator = HeatmapUploader { + tenant_manager, + remote_storage, + cancel: cancel.clone(), + tenants: HashMap::new(), + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("heatmap_uploader")) + .await +} + +/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event +/// handling loop and mutates it as needed: there are no locks here, because that event loop +/// can hold &mut references to this type throughout. +struct HeatmapUploader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + + tenants: HashMap, +} struct WriteInProgress { barrier: Barrier, } +impl RunningJob for WriteInProgress { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + struct UploadPending { tenant: Arc, last_digest: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for UploadPending { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.tenant.get_tenant_shard_id() + } } struct WriteComplete { @@ -45,6 +97,12 @@ struct WriteComplete { next_upload: Option, } +impl scheduler::Completion for WriteComplete { + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have @@ -68,267 +126,111 @@ struct UploaderTenantState { next_upload: Option, } -/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event -/// handling loop and mutates it as needed: there are no locks here, because that event loop -/// can hold &mut references to this type throughout. -struct HeatmapUploader { - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - cancel: CancellationToken, +type Scheduler = TenantBackgroundJobs< + HeatmapUploader, + UploadPending, + WriteInProgress, + WriteComplete, + UploadCommand, +>; - tenants: HashMap, - - /// Tenants with work to do, for which tasks should be spawned as soon as concurrency - /// limits permit it. - tenants_pending: std::collections::VecDeque, - - /// Tenants for which a task in `tasks` has been spawned. - tenants_uploading: HashMap, - - tasks: JoinSet<()>, - - /// Channel for our child tasks to send results to: we use a channel for results rather than - /// just getting task results via JoinSet because we need the channel's recv() "sleep until something - /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty" - /// behavior. 
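Both the downloader and the heatmap uploader are being rebuilt on top of the shared scheduler added in scheduler.rs further down. Judging from the impl blocks in these hunks, the contract between a subsystem and the scheduler looks roughly like the trait sketch below; bounds and signatures are simplified and the real definitions may differ in detail:

    use futures::Future;
    use std::pin::Pin;
    use std::time::Duration;

    type TenantShardId = u64; // stand-in for pageserver_api::shard::TenantShardId

    trait PendingJob {
        fn get_tenant_shard_id(&self) -> &TenantShardId;
    }
    trait RunningJob { /* e.g. expose a Barrier so callers can await completion */ }
    trait Completion {
        fn get_tenant_shard_id(&self) -> &TenantShardId;
    }

    struct SchedulingResult<PJ> {
        jobs: Vec<PJ>,
        /// Optional hint: how often the generator would like schedule() to be called.
        want_interval: Option<Duration>,
    }

    /// The per-subsystem half: decides what to run, how to run it, and what to do when it ends.
    #[async_trait::async_trait]
    trait JobGenerator<PJ: PendingJob, RJ: RunningJob, C: Completion, CMD> {
        /// Periodically called to (re)build the queue of pending jobs.
        async fn schedule(&mut self) -> SchedulingResult<PJ>;
        /// Turn a pending job into a running handle plus a future yielding its completion.
        fn spawn(&mut self, job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
        /// Create a job on demand, e.g. for an admin API request.
        fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
        /// Record the outcome of a finished job.
        fn on_completion(&mut self, completion: C);
    }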
- task_result_tx: tokio::sync::mpsc::UnboundedSender, - task_result_rx: tokio::sync::mpsc::UnboundedReceiver, - - concurrent_uploads: usize, - - scheduling_interval: Duration, -} - -/// The uploader task runs a loop that periodically wakes up and schedules tasks for -/// tenants that require an upload, or handles any commands that have been sent into -/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we -/// spawn. -/// -/// Scheduling iterations are somewhat infrequent. However, each one will enqueue -/// all tenants that require an upload, and in between scheduling iterations we will -/// continue to spawn new tasks for pending tenants, as our concurrency limit permits. -/// -/// While we take a CancellationToken here, it is subordinate to the CancellationTokens -/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise -/// we might block waiting on a Tenant. -pub(super) async fn heatmap_uploader_task( - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - mut command_queue: tokio::sync::mpsc::Receiver>, - background_jobs_can_start: Barrier, - cancel: CancellationToken, -) -> anyhow::Result<()> { - let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency; - - let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); - - let mut uploader = HeatmapUploader { - tenant_manager, - remote_storage, - cancel: cancel.clone(), - tasks: JoinSet::new(), - tenants: HashMap::new(), - tenants_pending: std::collections::VecDeque::new(), - tenants_uploading: HashMap::new(), - task_result_tx: result_tx, - task_result_rx: result_rx, - concurrent_uploads, - scheduling_interval: DEFAULT_SCHEDULING_INTERVAL, - }; - - tracing::info!("Waiting for background_jobs_can start..."); - background_jobs_can_start.wait().await; - tracing::info!("background_jobs_can is ready, proceeding."); - - while !cancel.is_cancelled() { - // Look for new work: this is relatively expensive because we have to go acquire the lock on - // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones - // require an upload. - uploader.schedule_iteration().await?; - - // Between scheduling iterations, we will: - // - Drain any complete tasks and spawn pending tasks - // - Handle incoming administrative commands - // - Check our cancellation token - let next_scheduling_iteration = Instant::now() - .checked_add(uploader.scheduling_interval) - .unwrap_or_else(|| { - tracing::warn!( - "Scheduling interval invalid ({}s), running immediately!", - uploader.scheduling_interval.as_secs_f64() - ); - Instant::now() - }); - loop { - tokio::select! { - _ = cancel.cancelled() => { - // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. 
- tracing::info!("Heatmap uploader joining tasks"); - while let Some(_r) = uploader.tasks.join_next().await {}; - tracing::info!("Heatmap uploader terminating"); - - break; - }, - _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { - tracing::debug!("heatmap_uploader_task: woke for scheduling interval"); - break;}, - cmd = command_queue.recv() => { - tracing::debug!("heatmap_uploader_task: woke for command queue"); - let cmd = match cmd { - Some(c) =>c, - None => { - // SecondaryController was destroyed, and this has raced with - // our CancellationToken - tracing::info!("Heatmap uploader terminating"); - cancel.cancel(); - break; - } - }; - - let CommandRequest{ - response_tx, - payload - } = cmd; - uploader.handle_command(payload, response_tx); - }, - _ = uploader.process_next_completion() => { - if !cancel.is_cancelled() { - uploader.spawn_pending(); - } - } - } - } - } - - Ok(()) -} - -impl HeatmapUploader { - /// Periodic execution phase: inspect all attached tenants and schedule any work they require. - async fn schedule_iteration(&mut self) -> anyhow::Result<()> { +#[async_trait::async_trait] +impl JobGenerator + for HeatmapUploader +{ + async fn schedule(&mut self) -> SchedulingResult { // Cull any entries in self.tenants whose Arc is gone self.tenants .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some()); - // The priority order of previously scheduled work may be invalidated by current state: drop - // all pending work (it will be re-scheduled if still needed) - self.tenants_pending.clear(); - - // Used a fixed 'now' through the following loop, for efficiency and fairness. let now = Instant::now(); - // While iterating over the potentially-long list of tenants, we will periodically yield - // to avoid blocking executor. - const YIELD_ITERATIONS: usize = 1000; + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; - // Iterate over tenants looking for work to do. let tenants = self.tenant_manager.get_attached_active_tenant_shards(); - for (i, tenant) in tenants.into_iter().enumerate() { - // Process is shutting down, drop out - if self.cancel.is_cancelled() { - return Ok(()); - } - // Skip tenants that already have a write in flight - if self - .tenants_uploading - .contains_key(tenant.get_tenant_shard_id()) - { - continue; - } + yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| { + let period = match tenant.get_heatmap_period() { + None => { + // Heatmaps are disabled for this tenant + return; + } + Some(period) => { + // If any tenant has asked for uploads more frequent than our scheduling interval, + // reduce it to match so that we can keep up. This is mainly useful in testing, where + // we may set rather short intervals. + result.want_interval = match result.want_interval { + None => Some(period), + Some(existing) => Some(std::cmp::min(period, existing)), + }; - self.maybe_schedule_upload(&now, tenant); + period + } + }; - if i + 1 % YIELD_ITERATIONS == 0 { - tokio::task::yield_now().await; - } - } - - // Spawn tasks for as many of our pending tenants as we can. - self.spawn_pending(); - - Ok(()) - } - - /// - /// Cancellation: this method is cancel-safe. 
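The removed scheduling loop above yielded back to the executor every thousand tenants; the new schedule() delegates that to utils::yielding_loop. A standalone illustration of the same idea (this is a sketch, not Neon's yielding_loop):

    use tokio_util::sync::CancellationToken;

    /// Visit every item of a potentially very long collection without monopolising the
    /// executor: yield every `interval` items and bail out early if `cancel` fires.
    async fn yielding_for_each<I, T>(
        interval: usize,
        cancel: &CancellationToken,
        items: I,
        mut visit: impl FnMut(T),
    ) -> Result<(), &'static str>
    where
        I: IntoIterator<Item = T>,
    {
        for (i, item) in items.into_iter().enumerate() {
            visit(item);
            if (i + 1) % interval == 0 {
                tokio::task::yield_now().await;
                if cancel.is_cancelled() {
                    return Err("cancelled");
                }
            }
        }
        Ok(())
    }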
- async fn process_next_completion(&mut self) { - match self.task_result_rx.recv().await { - Some(r) => { - self.on_completion(r); - } - None => { - unreachable!("Result sender is stored on Self"); - } - } - } - - /// The 'maybe' refers to the tenant's state: whether it is configured - /// for heatmap uploads at all, and whether sufficient time has passed - /// since the last upload. - fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc) { - match tenant.get_heatmap_period() { - None => { - // Heatmaps are disabled for this tenant + // Stale attachments do not upload anything: if we are in this state, there is probably some + // other attachment in mode Single or Multi running on another pageserver, and we don't + // want to thrash and overwrite their heatmap uploads. + if tenant.get_attach_mode() == AttachmentMode::Stale { return; } - Some(period) => { - // If any tenant has asked for uploads more frequent than our scheduling interval, - // reduce it to match so that we can keep up. This is mainly useful in testing, where - // we may set rather short intervals. - if period < self.scheduling_interval { - self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL); - } + + // Create an entry in self.tenants if one doesn't already exist: this will later be updated + // with the completion time in on_completion. + let state = self + .tenants + .entry(*tenant.get_tenant_shard_id()) + .or_insert_with(|| { + let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); + + UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), + last_digest: None, + } + }); + + // Decline to do the upload if insufficient time has passed + if state.next_upload.map(|nu| nu > now).unwrap_or(false) { + return; } - } - // Stale attachments do not upload anything: if we are in this state, there is probably some - // other attachment in mode Single or Multi running on another pageserver, and we don't - // want to thrash and overwrite their heatmap uploads. - if tenant.get_attach_mode() == AttachmentMode::Stale { - return; - } - - // Create an entry in self.tenants if one doesn't already exist: this will later be updated - // with the completion time in on_completion. 
- let state = self - .tenants - .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(Instant::now()), - last_digest: None, + let last_digest = state.last_digest; + result.jobs.push(UploadPending { + tenant, + last_digest, + target_time: state.next_upload, + period: Some(period), }); + }) + .await + .ok(); - // Decline to do the upload if insufficient time has passed - if state.next_upload.map(|nu| &nu > now).unwrap_or(false) { - return; - } + result + } - let last_digest = state.last_digest; - self.tenants_pending.push_back(UploadPending { + fn spawn( + &mut self, + job: UploadPending, + ) -> ( + WriteInProgress, + Pin + Send>>, + ) { + let UploadPending { tenant, last_digest, - }) - } + target_time, + period, + } = job; - fn spawn_pending(&mut self) { - while !self.tenants_pending.is_empty() - && self.tenants_uploading.len() < self.concurrent_uploads - { - // unwrap: loop condition includes !is_empty() - let pending = self.tenants_pending.pop_front().unwrap(); - self.spawn_upload(pending.tenant, pending.last_digest); - } - } - - fn spawn_upload(&mut self, tenant: Arc, last_digest: Option) { let remote_storage = self.remote_storage.clone(); - let tenant_shard_id = *tenant.get_tenant_shard_id(); let (completion, barrier) = utils::completion::channel(); - let result_tx = self.task_result_tx.clone(); - self.tasks.spawn(async move { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + (WriteInProgress { barrier }, Box::pin(async move { // Guard for the barrier in [`WriteInProgress`] let _completion = completion; @@ -362,22 +264,47 @@ impl HeatmapUploader { }; let now = Instant::now(); + + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. 
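Both job types now record a target execution time and, on completion, compare it against the actual finish time, warning when a run overshoots its period (warn_when_period_overrun in these hunks). A small sketch of that check, with a plain tracing::warn! standing in for the real helper:

    use std::time::{Duration, Instant};

    /// Warn when a periodic job finished more than one full period after its target start
    /// time, which usually means the system is not keeping up with its configured interval.
    fn check_period_overrun(target_time: Instant, period: Duration, now: Instant) {
        // Elapsed time includes scheduling lag as well as the execution of the job itself.
        let elapsed = now.duration_since(target_time);
        if elapsed > period {
            tracing::warn!(
                elapsed_ms = elapsed.as_millis() as u64,
                period_ms = period.as_millis() as u64,
                "background job overran its period"
            );
        }
    }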
+ if let (Some(target_time), Some(period)) = (target_time, period) { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = now.duration_since(target_time); + + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload); + } + let next_upload = tenant .get_heatmap_period() .and_then(|period| now.checked_add(period)); - result_tx - .send(WriteComplete { + WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, digest, next_upload, - }) - .ok(); - }); + } + }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } - self.tenants_uploading - .insert(tenant_shard_id, WriteInProgress { barrier }); + fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Starting heatmap write on command"); + let tenant = self + .tenant_manager + .get_attached_tenant_shard(*tenant_shard_id, true) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(UploadPending { + // Ignore our state for last digest: this forces an upload even if nothing has changed + last_digest: None, + tenant, + target_time: None, + period: None, + }) } #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))] @@ -389,7 +316,6 @@ impl HeatmapUploader { digest, next_upload, } = completion; - self.tenants_uploading.remove(&tenant_shard_id); use std::collections::hash_map::Entry; match self.tenants.entry(tenant_shard_id) { Entry::Vacant(_) => { @@ -402,69 +328,6 @@ impl HeatmapUploader { } } } - - fn handle_command( - &mut self, - command: UploadCommand, - response_tx: tokio::sync::oneshot::Sender, - ) { - match command { - UploadCommand::Upload(tenant_shard_id) => { - // If an upload was ongoing for this tenant, let it finish first. - let barrier = if let Some(writing_state) = - self.tenants_uploading.get(&tenant_shard_id) - { - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap write to complete"); - writing_state.barrier.clone() - } else { - // Spawn the upload then immediately wait for it. This will block processing of other commands and - // starting of other background work. - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Starting heatmap write on command"); - let tenant = match self - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, true) - { - Ok(t) => t, - Err(e) => { - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { - result: Err(e.into()), - })); - return; - } - }; - self.spawn_upload(tenant, None); - let writing_state = self - .tenants_uploading - .get(&tenant_shard_id) - .expect("We just inserted this"); - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap upload to complete"); - - writing_state.barrier.clone() - }; - - // This task does no I/O: it only listens for a barrier's completion and then - // sends to the command response channel. It is therefore safe to spawn this without - // any gates/task_mgr hooks. 
- tokio::task::spawn(async move { - barrier.wait().await; - - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Heatmap upload complete"); - - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { result: Ok(()) })) - }); - } - } - } } enum UploadHeatmapOutcome { @@ -487,7 +350,6 @@ enum UploadHeatmapError { /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. -#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))] async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs new file mode 100644 index 0000000000..cf01a100d9 --- /dev/null +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -0,0 +1,361 @@ +use async_trait; +use futures::Future; +use std::{ + collections::HashMap, + marker::PhantomData, + pin::Pin, + time::{Duration, Instant}, +}; + +use pageserver_api::shard::TenantShardId; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use utils::{completion::Barrier, yielding_loop::yielding_loop}; + +use super::{CommandRequest, CommandResponse}; + +/// Scheduling interval is the time between calls to JobGenerator::schedule. +/// When we schedule jobs, the job generator may provide a hint of its preferred +/// interval, which we will respect within these intervals. +const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); +const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); + +/// Scheduling helper for background work across many tenants. +/// +/// Systems that need to run background work across many tenants may use this type +/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`] +/// implementation to provide the work to execute. This is a simple scheduler that just +/// polls the generator for outstanding work, replacing its queue of pending work with +/// what the generator yields on each call: the job generator can change its mind about +/// the order of jobs between calls. The job generator is notified when jobs complete, +/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement +/// admin APIs). +/// +/// For an example see [`crate::tenant::secondary::heatmap_uploader`] +/// +/// G: A JobGenerator that this scheduler will poll to find pending jobs +/// PJ: 'Pending Job': type for job descriptors that are ready to run +/// RJ: 'Running Job' type' for jobs that have been spawned +/// C : 'Completion' type that spawned jobs will send when they finish +/// CMD: 'Command' type that the job generator will accept to create jobs on-demand +pub(super) struct TenantBackgroundJobs +where + G: JobGenerator, + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + generator: G, + + /// Ready to run. Will progress to `running` once concurrent limit is satisfied, or + /// be removed on next scheduling pass. + pending: std::collections::VecDeque, + + /// Tasks currently running in Self::tasks for these tenants. Check this map + /// before pushing more work into pending for the same tenant. + running: HashMap, + + tasks: JoinSet, + + concurrency: usize, + + /// How often we would like schedule_interval to be called. 
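+    /// Starts at MAX_SCHEDULING_INTERVAL and is nudged toward whatever interval the job
+    /// generator asks for, clamped to [MIN_SCHEDULING_INTERVAL, MAX_SCHEDULING_INTERVAL].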
+ pub(super) scheduling_interval: Duration, + + _phantom: PhantomData<(PJ, RJ, C, CMD)>, +} + +#[async_trait::async_trait] +pub(crate) trait JobGenerator +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + /// Called at each scheduling interval. Return a list of jobs to run, most urgent first. + /// + /// This function may be expensive (e.g. walk all tenants), but should not do any I/O. + /// Implementations should take care to yield the executor periodically if running + /// very long loops. + /// + /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending + /// jobs is not drained by the next scheduling interval, pending jobs will be cleared + /// and re-generated. + async fn schedule(&mut self) -> SchedulingResult; + + /// Called when a pending job is ready to be run. + /// + /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it. + fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin + Send>>); + + /// Called when a job previously spawned with spawn() transmits its completion + fn on_completion(&mut self, completion: C); + + /// Called when a command is received. A job will be spawned immediately if the return + /// value is Some, ignoring concurrency limits and the pending queue. + fn on_command(&mut self, cmd: CMD) -> anyhow::Result; +} + +/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling +pub(super) struct SchedulingResult { + pub(super) jobs: Vec, + /// The job generator would like to be called again this soon + pub(super) want_interval: Option, +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait PendingJob { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait Completion: Send + 'static { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait RunningJob { + fn get_barrier(&self) -> Barrier; +} + +impl TenantBackgroundJobs +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, + G: JobGenerator, +{ + pub(super) fn new(generator: G, concurrency: usize) -> Self { + Self { + generator, + pending: std::collections::VecDeque::new(), + running: HashMap::new(), + tasks: JoinSet::new(), + concurrency, + scheduling_interval: MAX_SCHEDULING_INTERVAL, + _phantom: PhantomData, + } + } + + pub(super) async fn run( + &mut self, + mut command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, + ) { + tracing::info!("Waiting for background_jobs_can start..."); + background_jobs_can_start.wait().await; + tracing::info!("background_jobs_can is ready, proceeding."); + + while !cancel.is_cancelled() { + // Look for new work: this is relatively expensive because we have to go acquire the lock on + // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones + // require an upload. 
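+            // Each pass also discards whatever was left in the pending queue from the
+            // previous pass, so the generator's latest priority order always wins.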
+ self.schedule_iteration(&cancel).await; + + if cancel.is_cancelled() { + return; + } + + // Schedule some work, if concurrency limit permits it + self.spawn_pending(); + + // Between scheduling iterations, we will: + // - Drain any complete tasks and spawn pending tasks + // - Handle incoming administrative commands + // - Check our cancellation token + let next_scheduling_iteration = Instant::now() + .checked_add(self.scheduling_interval) + .unwrap_or_else(|| { + tracing::warn!( + "Scheduling interval invalid ({}s)", + self.scheduling_interval.as_secs_f64() + ); + // unwrap(): this constant is small, cannot fail to add to time unless + // we are close to the end of the universe. + Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap() + }); + loop { + tokio::select! { + _ = cancel.cancelled() => { + tracing::info!("joining tasks"); + // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. + // It is the callers responsibility to make sure that the tasks they scheduled + // respect an appropriate cancellation token, to shut down promptly. It is only + // safe to wait on joining these tasks because we can see the cancellation token + // has been set. + while let Some(_r) = self.tasks.join_next().await {} + tracing::info!("terminating on cancellation token."); + + break; + }, + _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { + tracing::debug!("woke for scheduling interval"); + break;}, + cmd = command_queue.recv() => { + tracing::debug!("woke for command queue"); + let cmd = match cmd { + Some(c) =>c, + None => { + // SecondaryController was destroyed, and this has raced with + // our CancellationToken + tracing::info!("terminating on command queue destruction"); + cancel.cancel(); + break; + } + }; + + let CommandRequest{ + response_tx, + payload + } = cmd; + self.handle_command(payload, response_tx); + }, + _ = async { + let completion = self.process_next_completion().await; + match completion { + Some(c) => { + self.generator.on_completion(c); + if !cancel.is_cancelled() { + self.spawn_pending(); + } + }, + None => { + // Nothing is running, so just wait: expect that this future + // will be dropped when something in the outer select! fires. + cancel.cancelled().await; + } + } + + } => {} + } + } + } + } + + fn do_spawn(&mut self, job: PJ) { + let tenant_shard_id = *job.get_tenant_shard_id(); + let (in_progress, fut) = self.generator.spawn(job); + + self.tasks.spawn(fut); + + self.running.insert(tenant_shard_id, in_progress); + } + + /// For all pending tenants that are elegible for execution, spawn their task. + /// + /// Caller provides the spawn operation, we track the resulting execution. + fn spawn_pending(&mut self) { + while !self.pending.is_empty() && self.running.len() < self.concurrency { + // unwrap: loop condition includes !is_empty() + let pending = self.pending.pop_front().unwrap(); + self.do_spawn(pending); + } + } + + /// For administrative commands: skip the pending queue, ignore concurrency limits + fn spawn_now(&mut self, job: PJ) -> &RJ { + let tenant_shard_id = *job.get_tenant_shard_id(); + self.do_spawn(job); + self.running + .get(&tenant_shard_id) + .expect("We just inserted this") + } + + /// Wait until the next task completes, and handle its completion + /// + /// Cancellation: this method is cancel-safe. 
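+    /// (The only await is `JoinSet::join_next`, which tokio documents as cancel-safe:
+    /// if this future is dropped before completing, no task is removed from the set.)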
+ async fn process_next_completion(&mut self) -> Option { + match self.tasks.join_next().await { + Some(r) => { + // We use a channel to drive completions, but also + // need to drain the JoinSet to avoid completed tasks + // accumulating. These calls are 1:1 because every task + // we spawn into this joinset submits is result to the channel. + let completion = r.expect("Panic in background task"); + + self.running.remove(completion.get_tenant_shard_id()); + Some(completion) + } + None => { + // Nothing is running, so we have nothing to wait for. We may drop out: the + // main even loop will call us again after the next time it has run something. + None + } + } + } + + /// Convert the command into a pending job, spawn it, and when the spawned + /// job completes, send the result down `response_tx`. + fn handle_command( + &mut self, + cmd: CMD, + response_tx: tokio::sync::oneshot::Sender, + ) { + let job = match self.generator.on_command(cmd) { + Ok(j) => j, + Err(e) => { + response_tx.send(CommandResponse { result: Err(e) }).ok(); + return; + } + }; + + let tenant_shard_id = job.get_tenant_shard_id(); + let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + barrier + } else { + let running = self.spawn_now(job); + running.get_barrier().clone() + }; + + // This task does no I/O: it only listens for a barrier's completion and then + // sends to the command response channel. It is therefore safe to spawn this without + // any gates/task_mgr hooks. + tokio::task::spawn(async move { + barrier.wait().await; + + response_tx.send(CommandResponse { result: Ok(()) }).ok(); + }); + } + + fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option { + self.running.get(tenant_shard_id).map(|r| r.get_barrier()) + } + + /// Periodic execution phase: inspect all attached tenants and schedule any work they require. + /// + /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`] + /// + /// This function resets the pending list: it is assumed that the caller may change their mind about + /// which tenants need work between calls to schedule_iteration. + async fn schedule_iteration(&mut self, cancel: &CancellationToken) { + let SchedulingResult { + jobs, + want_interval, + } = self.generator.schedule().await; + + // Adjust interval based on feedback from the job generator + if let Some(want_interval) = want_interval { + // Calculation uses second granularity: this scheduler is not intended for high frequency tasks + self.scheduling_interval = Duration::from_secs(std::cmp::min( + std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()), + MAX_SCHEDULING_INTERVAL.as_secs(), + )); + } + + // The priority order of previously scheduled work may be invalidated by current state: drop + // all pending work (it will be re-scheduled if still needed) + self.pending.clear(); + + // While iterating over the potentially-long list of tenants, we will periodically yield + // to avoid blocking executor. 
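+        // (Assumption: `yielding_loop(interval, cancel, iter, f)` yields to the executor
+        // every `interval` items and returns Err if `cancel` fires first, hence the `.ok()`.)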
+ yielding_loop(1000, cancel, jobs.into_iter(), |job| { + // Skip tenants that already have a write in flight + if !self.running.contains_key(job.get_tenant_shard_id()) { + self.pending.push_back(job); + } + }) + .await + .ok(); + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 003cf0e92b..7c9103eea8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use tokio::sync::RwLock; +use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{DeltaLayerWriter, ResidentLayer}; @@ -246,16 +246,43 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub async fn put_value( + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, val: &Value, ctx: &RequestContext, ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let inner: &mut _ = &mut *self.inner.write().await; + let mut inner = self.inner.write().await; self.assert_writable(); + self.put_value_locked(&mut inner, key, lsn, val, ctx).await + } + + pub(crate) async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> Result<()> { + let mut inner = self.inner.write().await; + self.assert_writable(); + for (key, vals) in values { + for (lsn, val) in vals { + self.put_value_locked(&mut inner, *key, *lsn, val, ctx) + .await?; + } + } + Ok(()) + } + + async fn put_value_locked( + &self, + locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { // Avoid doing allocations for "small" values. @@ -264,7 +291,7 @@ impl InMemoryLayer { let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); buf.clear(); val.ser_into(&mut buf)?; - inner + locked_inner .file .write_blob( &buf, @@ -275,7 +302,7 @@ impl InMemoryLayer { .await? }; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = locked_inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -285,13 +312,11 @@ impl InMemoryLayer { Ok(()) } - pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys - Ok(()) } - /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ae911b31e..f5adf9d639 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1118,6 +1118,7 @@ impl LayerInner { tracing::info!("evicted layer after unknown residence period"); } } + timeline.metrics.evictions.inc(); timeline .metrics .resident_physical_size_sub(self.desc.file_size); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7ff1873eda..aa5894cc37 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind { ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, + HeatmapUpload, + SecondaryDownload, } impl BackgroundLoopKind { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e84fa1848..24a92859b7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -373,15 +373,20 @@ pub struct GcInfo { } /// An error happened in a get() operation. -#[derive(thiserror::Error)] -pub enum PageReconstructError { +#[derive(thiserror::Error, Debug)] +pub(crate) enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + /// The operation was cancelled + #[error("Cancelled")] Cancelled, /// The ancestor of this is being stopped + #[error("ancestor timeline {0} is being stopped")] AncestorStopping(TimelineId), /// An error happened replaying WAL records @@ -402,32 +407,6 @@ enum FlushLayerError { Other(#[from] anyhow::Error), } -impl std::fmt::Debug for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - -impl std::fmt::Display for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum WaitLsnError { + // Called on a timeline which is shutting down + #[error("Shutdown")] + Shutdown, + + // Called on an timeline not in active state or shutting down + #[error("Bad state (not active)")] + BadState, + + // Timeout expired while waiting for LSN to catch up with goal. + #[error("{0}")] + Timeout(String), +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -486,7 +480,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
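+    ///
+    /// With sharding, callers are expected to have already routed the key to the right
+    /// shard; a debug assertion below rejects keys that `is_key_disposable` would drop.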
- pub async fn get( + pub(crate) async fn get( &self, key: Key, lsn: Lsn, @@ -496,6 +490,11 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + // XXX: structured stats collection for layer eviction here. trace!( "get page request for {}@{} from task kind {:?}", @@ -629,24 +628,28 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn( + pub(crate) async fn wait_lsn( &self, lsn: Lsn, _ctx: &RequestContext, /* Prepare for use by cancellation */ - ) -> anyhow::Result<()> { - anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); + ) -> Result<(), WaitLsnError> { + if self.cancel.is_cancelled() { + return Err(WaitLsnError::Shutdown); + } else if !self.is_active() { + return Err(WaitLsnError::BadState); + } // This should never be called from the WAL receiver, because that could lead // to a deadlock. - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -660,18 +663,22 @@ impl Timeline { { Ok(()) => Ok(()), Err(e) => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); - let walreceiver_status = self.walreceiver_status(); - Err(anyhow::Error::new(e).context({ - format!( + use utils::seqwait::SeqWaitError::*; + match e { + Shutdown => Err(WaitLsnError::Shutdown), + Timeout => { + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(_timer); + let walreceiver_status = self.walreceiver_status(); + Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn(), walreceiver_status, - ) - })) + ))) + } + } } } } @@ -1459,6 +1466,7 @@ impl Timeline { max_lsn_wal_lag, auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), + ingest_batch_size: self.conf.ingest_batch_size, }, broker_client, ctx, @@ -2223,13 +2231,13 @@ impl Timeline { return Err(layer_traversal_error( if cfg!(test) { format!( - "could not find data for key {} at LSN {}, for request at LSN {}\n{}", - key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), ) } else { format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn + "could not find data for key {} (shard {:?}) at LSN {}, for request at 
LSN {}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn ) }, traversal_path, @@ -2289,11 +2297,12 @@ impl Timeline { ancestor .wait_lsn(timeline.ancestor_lsn, ctx) .await - .with_context(|| { - format!( - "wait for lsn {} on ancestor timeline_id={}", - timeline.ancestor_lsn, ancestor.timeline_id - ) + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => PageReconstructError::Cancelled, + e @ WaitLsnError::BadState => { + PageReconstructError::Other(anyhow::anyhow!(e)) + } })?; timeline_owned = ancestor; @@ -2471,9 +2480,27 @@ impl Timeline { Ok(()) } - async fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - let layer = self.get_layer_for_write(lsn).await?; - layer.put_tombstone(key_range, lsn).await?; + async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Pick the first LSN in the batch to get the layer to write to. + for lsns in values.values() { + if let Some((lsn, _)) = lsns.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_values(values, ctx).await?; + break; + } + } + Ok(()) + } + + async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = tombstones.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_tombstones(tombstones).await?; + } Ok(()) } @@ -3035,6 +3062,15 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + key = key.next(); + continue; + } let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { @@ -3061,6 +3097,7 @@ impl Timeline { } } }; + image_layer_writer.put_image(key, &img).await?; key = key.next(); } @@ -3631,7 +3668,15 @@ impl Timeline { ))) }); - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + if !self.shard_identity.is_key_disposable(&key) { + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } if !new_layers.is_empty() { fail_point!("after-timeline-compacted-first-L1"); @@ -4186,7 +4231,7 @@ impl Timeline { .context("Failed to reconstruct a page image:") { Ok(img) => img, - Err(e) => return Err(PageReconstructError::from(e)), + Err(e) => return Err(PageReconstructError::WalRedo(e)), }; if img.len() == page_cache::PAGE_SZ { @@ -4529,8 +4574,16 @@ impl<'a> TimelineWriter<'a> { self.tl.put_value(key, lsn, value, ctx).await } - pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - self.tl.put_tombstone(key_range, lsn).await + pub(crate) async fn put_batch( + &self, + batch: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.tl.put_values(batch, ctx).await + } + + pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + self.tl.put_tombstones(batch).await } /// Track the end of the latest digested WAL record. @@ -4541,11 +4594,11 @@ impl<'a> TimelineWriter<'a> { /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. 
- pub fn finish_write(&self, new_lsn: Lsn) { + pub(crate) fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - pub fn update_current_logical_size(&self, delta: i64) { + pub(crate) fn update_current_logical_size(&self, delta: i64) { self.tl.update_current_logical_size(delta) } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index e32265afb5..2fab6722b8 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -58,6 +58,7 @@ pub struct WalReceiverConf { pub max_lsn_wal_lag: NonZeroU64, pub auth_token: Option>, pub availability_zone: Option, + pub ingest_batch_size: u64, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 5a5b3d7586..7fa5bb7689 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -411,6 +411,7 @@ impl ConnectionManagerState { let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; + let ingest_batch_size = self.conf.ingest_batch_size; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -430,6 +431,7 @@ impl ConnectionManagerState { connect_timeout, ctx, node_id, + ingest_batch_size, ) .await; @@ -1345,6 +1347,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), auth_token: None, availability_zone: None, + ingest_batch_size: 1, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 61ab236322..e398d683e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, @@ -106,6 +106,7 @@ impl From for WalReceiverError { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. +#[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, @@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection( connect_timeout: Duration, ctx: RequestContext, node: NodeId, + ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection( { let mut decoded = DecodedWALRecord::default(); - let mut modification = timeline.begin_modification(endlsn); + let mut modification = timeline.begin_modification(startlsn); + let mut uncommitted_records = 0; + let mut filtered_records = 0; while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. 
Without this alignment we are @@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - walingest + // Ingest the records without immediately committing them. + let ingested = walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; + + // Commit every ingest_batch_size records. Even if we filtered out + // all records, we still need to call commit to advance the LSN. + uncommitted_records += 1; + if uncommitted_records >= ingest_batch_size { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; + uncommitted_records = 0; + filtered_records = 0; + } + } + + // Commit the remaining records. + if uncommitted_records > 0 { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index a6a8972970..8df0c81c7a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -48,20 +48,18 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -pub struct WalIngest<'a> { +pub struct WalIngest { shard: ShardIdentity, - timeline: &'a Timeline, - checkpoint: CheckPoint, checkpoint_modified: bool, } -impl<'a> WalIngest<'a> { +impl WalIngest { pub async fn new( - timeline: &'a Timeline, + timeline: &Timeline, startpoint: Lsn, - ctx: &'_ RequestContext, - ) -> anyhow::Result> { + ctx: &RequestContext, + ) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -70,7 +68,6 @@ impl<'a> WalIngest<'a> { Ok(WalIngest { shard: *timeline.get_shard_identity(), - timeline, checkpoint, checkpoint_modified: false, }) @@ -84,6 +81,8 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
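+    /// With batched ingestion the record is only buffered in `modification`; committing
+    /// is now the caller's responsibility (see the batching in walreceiver_connection.rs).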
/// + /// This function returns `true` if the record was ingested, and `false` if it was filtered out + /// pub async fn ingest_record( &mut self, recdata: Bytes, @@ -91,11 +90,13 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { WAL_INGEST.records_received.inc(); + let pg_version = modification.tline.pg_version; + let prev_len = modification.len(); - modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version)?; + modification.set_lsn(lsn)?; + decode_wal_record(recdata, decoded, pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -132,9 +133,9 @@ impl<'a> WalIngest<'a> { } pg_constants::RM_DBASE_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID"); + debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if self.timeline.pg_version == 14 { + if pg_version == 14 { if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); @@ -150,7 +151,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 15 { + } else if pg_version == 15 { if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -170,7 +171,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 16 { + } else if pg_version == 16 { if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -399,19 +400,11 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = false; } - if modification.is_empty() { - tracing::debug!("ingest: filtered out record @ LSN {lsn}"); - WAL_INGEST.records_filtered.inc(); - modification.tline.finish_write(lsn); - } else { - WAL_INGEST.records_committed.inc(); - modification.commit(ctx).await?; - } + // Note that at this point this record is only cached in the modification + // until commit() is called to flush the data into the repository and update + // the latest LSN. - // Now that this record has been fully handled, including updating the - // checkpoint data, let the repository know that it is up-to-date to this LSN. - - Ok(()) + Ok(modification.len() > prev_len) } /// Do not store this block, but observe it for the purposes of updating our relation size state. @@ -458,7 +451,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
// do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -511,7 +504,7 @@ impl<'a> WalIngest<'a> { let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - match self.timeline.pg_version { + match modification.tline.pg_version { 14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -735,7 +728,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -816,10 +809,11 @@ impl<'a> WalIngest<'a> { let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + let pg_version = modification.tline.pg_version; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - match self.timeline.pg_version { + match pg_version { 16 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -882,7 +876,7 @@ impl<'a> WalIngest<'a> { } _ => bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", - self.timeline.pg_version + pg_version ), } @@ -905,7 +899,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -983,16 +977,14 @@ impl<'a> WalIngest<'a> { let src_db_id = rec.src_db_id; let src_tablespace_id = rec.src_tablespace_id; - // Creating a database is implemented by copying the template (aka. source) database. - // To copy all the relations, we need to ask for the state as of the same LSN, but we - // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for - // the last valid LSN to advance up to it. So we use the previous record's LSN in the - // get calls instead. 
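+        // (Assumption: `Version::Modified(modification)` lets the get_* calls below read
+        // through the in-flight modification, so pinning an explicit `req_lsn` is no
+        // longer necessary.)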
- let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) + .list_rels( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -1000,7 +992,12 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) + .get_relmap_file( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; modification .put_relmap_file(tablespace_id, db_id, filemap, ctx) @@ -1014,7 +1011,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1032,7 +1029,13 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + true, + ctx, + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1103,7 +1106,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) @@ -1125,7 +1128,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) @@ -1198,10 +1201,9 @@ impl<'a> WalIngest<'a> { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true, ctx) + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1255,10 +1257,9 @@ impl<'a> WalIngest<'a> { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1470,20 +1471,6 @@ impl<'a> WalIngest<'a> { Ok(()) } - async fn get_relsize( - &mut self, - rel: RelTag, - lsn: Lsn, - ctx: &RequestContext, - ) -> anyhow::Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { - 0 - } else { - self.timeline.get_rel_size(rel, lsn, true, ctx).await? - }; - Ok(nblocks) - } - async fn handle_rel_extend( &mut self, modification: &mut DatadirModification<'_>, @@ -1495,7 +1482,6 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. 
// TODO: would be nice if to be more explicit about it - let last_lsn = modification.lsn; // Get current size and put rel creation if rel doesn't exist // @@ -1503,11 +1489,14 @@ impl<'a> WalIngest<'a> { // check the cache too. This is because eagerly checking the cache results in // less work overall and 10% better performance. It's more work on cache miss // but cache miss is rare. - let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) { + let old_nblocks = if let Some(nblocks) = modification + .tline + .get_cached_rel_size(&rel, modification.get_lsn()) + { nblocks - } else if !self - .timeline - .get_rel_exists(rel, last_lsn, true, ctx) + } else if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1517,7 +1506,10 @@ impl<'a> WalIngest<'a> { .context("Relation Error")?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? }; if new_nblocks > old_nblocks { @@ -1570,10 +1562,9 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it - let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn, ctx) + let old_nblocks = if !modification + .tline + .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1582,8 +1573,9 @@ impl<'a> WalIngest<'a> { .await?; 0 } else { - self.timeline - .get_slru_segment_size(kind, segno, last_lsn, ctx) + modification + .tline + .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx) .await? }; @@ -1606,6 +1598,26 @@ impl<'a> WalIngest<'a> { } } +async fn get_relsize( + modification: &DatadirModification<'_>, + rel: RelTag, + ctx: &RequestContext, +) -> anyhow::Result { + let nblocks = if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .await? + { + 0 + } else { + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? + }; + Ok(nblocks) +} + #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { @@ -1632,10 +1644,7 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test<'a>( - tline: &'a Timeline, - ctx: &RequestContext, - ) -> Result> { + async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file @@ -1680,29 +1689,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); @@ -1710,46 +1719,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1765,19 +1774,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); @@ -1785,13 +1794,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1804,7 +1813,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) .await?, 0 ); @@ -1817,19 +1826,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x70), 
false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1842,21 +1851,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1883,13 +1892,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); @@ -1902,7 +1911,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, false ); @@ -1920,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, 1 ); @@ -1959,24 +1968,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, relsize ); @@ -1987,7 +1996,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2004,7 +2013,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 1 ); @@ -2014,7 +2023,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2023,7 +2032,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, relsize ); @@ -2032,7 +2041,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2052,13 +2061,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, relsize ); @@ -2068,7 +2077,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2101,7 +2110,9 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE + 1 ); @@ -2113,7 +2124,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2126,7 +2139,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2142,7 +2157,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), 
false, &ctx) + .await?, size as BlockNumber ); @@ -2179,7 +2196,7 @@ mod tests { let wal_segment_path = format!("{path}/000000010000000000000001.zst"); let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); - let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); let (tenant, ctx) = harness.load().await; @@ -2221,7 +2238,7 @@ mod tests { let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx) .await .unwrap(); - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(startpoint); let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); @@ -2235,6 +2252,7 @@ mod tests { .await .unwrap(); } + modification.commit(&ctx).await.unwrap(); } let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 94e95fd3b3..6918698f29 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,6 +22,7 @@ use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use pageserver_api::shard::TenantShardId; use serde::Serialize; use std::collections::VecDeque; use std::io; @@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; +use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; -#[cfg(feature = "testing")] -use pageserver_api::shard::TenantShardId; - use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, @@ -92,7 +90,7 @@ struct ProcessOutput { /// records. /// pub struct PostgresRedoManager { - tenant_id: TenantId, + tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, redo_process: RwLock>>, @@ -186,10 +184,13 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { + pub fn new( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenant_id, + tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), redo_process: RwLock::new(None), @@ -244,8 +245,12 @@ impl PostgresRedoManager { let timer = WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); let proc = Arc::new( - WalRedoProcess::launch(self.conf, self.tenant_id, pg_version) - .context("launch walredo process")?, + WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, ); timer.observe_duration(); *proc_guard = Some(Arc::clone(&proc)); @@ -638,7 +643,7 @@ impl CloseFileDescriptors for C { struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, // Some() on construction, only becomes None on Drop. child: Option, stdout: Mutex, @@ -652,10 +657,10 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. 
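+    // The process is now keyed by TenantShardId, so each tenant shard gets its own
+    // tracing spans and on-disk tenant path for the debug dump files written below.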
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] fn launch( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. @@ -680,7 +685,7 @@ impl WalRedoProcess { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. .close_fds() - .spawn_no_leak_child(tenant_id) + .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { @@ -741,12 +746,12 @@ impl WalRedoProcess { error!(error=?e, "failed to read from walredo stderr"); } } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version)) + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) ); Ok(Self { conf, - tenant_id, + tenant_shard_id, child: Some(child), stdin: Mutex::new(ProcessInput { stdin, @@ -772,7 +777,7 @@ impl WalRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] fn apply_wal_records( &self, tag: BufferTag, @@ -966,11 +971,7 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. - let path = self - .conf - .tenant_path(&TenantShardId::unsharded(self.tenant_id)) - .join(&filename); + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); let res = std::fs::OpenOptions::new() .write(true) @@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess { /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. struct NoLeakChild { - tenant_id: TenantId, + tenant_id: TenantShardId, child: Option, } @@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild { } impl NoLeakChild { - fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result { + fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { let child = command.spawn()?; Ok(NoLeakChild { tenant_id, @@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild { Some(child) => child, None => return, }; - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. @@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild { tokio::task::spawn_blocking(move || { // Intentionally don't inherit the tracing context from whoever is dropping us. // This thread here is going to outlive of our dropper. 
- let span = tracing::info_span!("walredo", %tenant_id); + let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); let _entered = span.enter(); Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) @@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild { } trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result; + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; } impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { NoLeakChild::spawn(tenant_id, self) } } @@ -1155,6 +1160,7 @@ mod tests { use crate::repository::Key; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; use bytes::Bytes; + use pageserver_api::shard::TenantShardId; use std::str::FromStr; use utils::{id::TenantId, lsn::Lsn}; @@ -1264,9 +1270,9 @@ mod tests { let repo_dir = camino_tempfile::tempdir()?; let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); Ok(RedoHarness { _repo_dir: repo_dir, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3b038f906f..3a7c0f1bb6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -35,7 +35,8 @@ #define PageStoreTrace DEBUG5 -#define RECONNECT_INTERVAL_USEC 1000000 +#define MIN_RECONNECT_INTERVAL_USEC 1000 +#define MAX_RECONNECT_INTERVAL_USEC 1000000 bool connected = false; PGconn *pageserver_conn = NULL; @@ -133,6 +134,11 @@ pageserver_connect(int elevel) const char *values[3]; int n; + static TimestampTz last_connect_time = 0; + static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; + TimestampTz now; + uint64_t us_since_last_connect; + Assert(!connected); if (CheckConnstringUpdated()) @@ -140,6 +146,22 @@ pageserver_connect(int elevel) ReloadConnstring(); } + now = GetCurrentTimestamp(); + us_since_last_connect = now - last_connect_time; + if (us_since_last_connect < delay_us) + { + pg_usleep(delay_us - us_since_last_connect); + delay_us *= 2; + if (delay_us > MAX_RECONNECT_INTERVAL_USEC) + delay_us = MAX_RECONNECT_INTERVAL_USEC; + last_connect_time = GetCurrentTimestamp(); + } + else + { + delay_us = MIN_RECONNECT_INTERVAL_USEC; + last_connect_time = now; + } + /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. 
If the NEON_AUTH_TOKEN environment @@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request) { HandleMainLoopInterrupts(); n_reconnect_attempts += 1; - pg_usleep(RECONNECT_INTERVAL_USEC); } n_reconnect_attempts = 0; } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7fb0cab9a0..1f7c473e7d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) port = strchr(host, ':'); if (port == NULL) { - walprop_log(FATAL, "port is not specified"); + wp_log(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); @@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) *sep++ = '\0'; if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - walprop_log(FATAL, "Too many safekeepers"); + wp_log(FATAL, "too many safekeepers"); } wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; @@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) - walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); @@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } if (wp->n_safekeepers < 1) { - walprop_log(FATAL, "Safekeepers addresses are not specified"); + wp_log(FATAL, "safekeepers addresses are not specified"); } wp->quorum = wp->n_safekeepers / 2 + 1; @@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); wp->greetRequest.systemId = wp->config->systemId; if (!wp->config->neon_timeline) - walprop_log(FATAL, "neon.timeline_id is not provided"); + wp_log(FATAL, "neon.timeline_id is not provided"); if (*wp->config->neon_timeline != '\0' && !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); if (!wp->config->neon_tenant) - walprop_log(FATAL, "neon.tenant_id is not provided"); + wp_log(FATAL, "neon.tenant_id is not provided"); if (*wp->config->neon_tenant != '\0' && !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant); + wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); wp->greetRequest.timeline = wp->config->pgTimeline; wp->greetRequest.walSegSize = wp->config->wal_segment_size; @@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wp->config->safekeeper_connection_timeout)) { - walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); + wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection 
attempt took longer than that", + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk) * * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ - walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * Even though the connection failed, we still need to clean up the @@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port); + wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); @@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { -#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */ +#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */ WalProposer *wp = sk->wp; #endif @@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * ResetConnection */ case SS_OFFLINE: - walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * requests. */ case SS_VOTING: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * Idle state for waiting votes from quorum. 
*/ case SS_IDLE: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk) switch (result) { case WP_CONN_POLLING_OK: - walprop_log(LOG, "connected with node %s:%s", sk->host, - sk->port); + wp_log(LOG, "connected with node %s:%s", sk->host, + sk->port); sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* @@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - walprop_log(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * If connecting failed, we don't want to restart the connection @@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk) if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) { - walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk) break; case WP_EXEC_FAILED: - walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; @@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk) * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + wp_log(WARNING, "received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (wp->n_connected == wp->quorum) { wp->propTerm++; - walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); + wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); wp->voteRequest = (VoteRequest) { @@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk) else if (sk->greetResponse.term > wp->propTerm) { /* Another compute with higher term is running. 
*/ - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, wp->propTerm); } /* @@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk) WalProposer *wp = sk->wp; /* We have quorum for voting, send our vote request */ - walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); + wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) return; @@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; - walprop_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + wp_log(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk) if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) { - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, wp->propTerm); } Assert(sk->voteResponse.term == wp->propTerm); @@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp) */ if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(FATAL, "failed to download WAL for logical replicaiton"); + wp_log(FATAL, "failed to download WAL for logical replicaiton"); } if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) @@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->timelineStartLsn != InvalidXLogRecPtr && wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) { - walprop_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); + wp_log(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(wp->timelineStartLsn), + LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); } wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; } @@ -969,7 +969,7 @@ 
DetermineEpochStartLsn(WalProposer *wp) { wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); } - walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); + wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } /* @@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; - walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, - wp->propTerm, - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, - LSN_FORMAT_ARGS(wp->truncateLsn)); + wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + wp->quorum, + wp->propTerm, + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp) * scenario. */ disable_core_dump(); - walprop_log(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); + wp_log(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } walprop_shared->mineLastElectedTerm = wp->propTerm; @@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < wp->truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty - * safekeeper with no history. In combination with a gap, that can - * happen when we introduce a new safekeeper to the cluster. This - * is a rare case, which is triggered manually for now, and should - * be treated with care. - */ - - /* - * truncateLsn will not change without ack from current - * safekeeper, and it's aligned to the WAL record, so we can - * safely start streaming from this point. 
- */ - sk->startStreamingAt = wp->truncateLsn; - - walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else { @@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn); + Assert(sk->startStreamingAt <= wp->availableLsn); msg.tag = 'e'; msg.term = wp->propTerm; @@ -1150,9 +1126,9 @@ SendProposerElected(Safekeeper *sk) msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0; - walprop_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + wp_log(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); @@ -1261,8 +1237,8 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* expected never to happen, c.f. 
walprop_pg_active_state_update_event_set */ if (events & WL_SOCKET_CLOSED) { - walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", - sk->host, sk->port); + wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -1323,12 +1299,12 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); resetStringInfo(&sk->outbuf); @@ -1355,8 +1331,8 @@ SendAppendRequests(Safekeeper *sk) case NEON_WALREAD_WOULDBLOCK: return true; case NEON_WALREAD_ERROR: - walprop_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1388,9 +1364,9 @@ SendAppendRequests(Safekeeper *sk) return true; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1429,11 +1405,11 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port); + wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port); if (sk->appendResponse.term > wp->propTerm) { @@ -1443,9 +1419,9 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. 
*/ disable_core_dump(); - walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, wp->propTerm); + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, wp->propTerm); } readAnything = true; @@ -1489,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->last_received_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { @@ -1526,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1541,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese * Skip unknown keys to support backward compatibile protocol * changes */ - walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1606,7 +1582,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->n_votes < wp->quorum) { - walprop_log(WARNING, "GetDonor called before elections are won"); + wp_log(WARNING, "GetDonor called before elections are won"); return NULL; } @@ -1734,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; case 
PG_ASYNC_READ_FAIL: - walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1774,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1851,9 +1827,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1904,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1943,9 +1919,9 @@ AsyncFlush(Safekeeper *sk) /* Nothing to do; try again when the socket's ready */ return false; case -1: - walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ResetConnection(sk); return false; default: @@ -1974,11 +1950,11 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); + * wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); + * wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * FormatSafekeeperState(Safekeeper *sk) @@ -2059,8 +2035,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * To give a descriptive message in the case of failure, we use elog * and then an assertion that's guaranteed to fail. 
*/ - walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } @@ -2199,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events) if (events & (~all_flags)) { - walprop_log(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + wp_log(WARNING, "event formatting found unexpected component %d", + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 6d478076fe..688d8e6e52 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -707,11 +707,23 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); #define WPEVENT 1337 /* special log level for walproposer internal * events */ +#define WP_LOG_PREFIX "[WP] " + +/* + * wp_log is used in pure wp code (walproposer.c), allowing API callback to + * catch logging. + */ #ifdef WALPROPOSER_LIB extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...); -#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__) #else -#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif +/* + * And wpg_log is used all other (postgres specific) walproposer code, just + * adding prefix. + */ +#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7773aabfab..61a2a54809 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) { StartReplicationCmd cmd; - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); + wpg_log(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = wp->greetRequest.timeline; cmd.startpoint = startpos; @@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void) { load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } /* Helper function */ @@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo) * PGconn structure" */ if (!pg_conn) - elog(FATAL, "failed to allocate new PGconn object"); + wpg_log(FATAL, "failed to allocate new PGconn object"); /* * And in theory this allocation can fail as well, but it's incredibly @@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk) * unused. We'll expect it's never returned. 
*/ case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* * This return is never actually reached, but it's here to make @@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn) */ if (!result) { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); return WP_EXEC_UNEXPECTED_SUCCESS; } @@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn) } if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); return return_val; } @@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status); /* * If there was actually an error, it'll be properly reported @@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQputCopyData", result); + wpg_log(FATAL, "invalid return %d from PQputCopyData", result); } /* @@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQflush", result); + wpg_log(FATAL, "invalid return %d from PQflush", result); } } @@ -1237,19 +1237,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) return true; /* recovery not needed */ endpos = wp->propEpochStartLsn; - /* - * If we need to download more than a max_slot_wal_keep_size, cap to it to - * avoid risk of exploding pg_wal. Logical replication won't work until - * recreated, but at least compute would start; this also follows - * max_slot_wal_keep_size semantics. 
- */ - download_range_mb = (endpos - startpos) / 1024 / 1024; - if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) - { - startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024; - walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb); - } timeline = wp->greetRequest.timeline; if (!neon_auth_token) @@ -1262,7 +1249,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not append password to the safekeeper connection string"); + wpg_log(FATAL, "could not append password to the safekeeper connection string"); } #if PG_MAJORVERSION_NUM < 16 @@ -1279,11 +1266,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) err))); return false; } - elog(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + wpg_log(LOG, + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + sk->host, sk->port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1481,11 +1468,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) { char log_prefix[64]; - snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port); + snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); + wpg_log(FATAL, "failed to allocate xlog reader"); } static NeonWALReadResult @@ -1549,7 +1536,7 @@ static void walprop_pg_init_event_set(WalProposer *wp) { if (waitEvents) - elog(FATAL, "double-initialization of event set"); + wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); @@ -1581,7 +1568,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events) Assert(sk->nwrEventPos == -1); sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); - elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); + wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1680,8 +1667,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) { WalProposer *wp = to_remove->wp; - elog(DEBUG5, "sk %s:%s: removing event, is_sk %d", - to_remove->host, to_remove->port, is_sk); + wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); /* * Shortpath for exiting if have nothing to do. 
We never call this @@ -1835,13 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); + wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); } /* @@ -1987,7 +1974,7 @@ GetLogRepRestartLSN(WalProposer *wp) { uint64 download_range_mb; - elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); /* * If we need to download more than a max_slot_wal_keep_size, @@ -1999,8 +1986,8 @@ GetLogRepRestartLSN(WalProposer *wp) download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) { - walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); return InvalidXLogRecPtr; } diff --git a/poetry.lock b/poetry.lock index 76dfd6d37d..428698cb5a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -288,70 +288,21 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] -[[package]] -name = "black" -version = "23.3.0" -description = "The uncompromising code formatter." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, - {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, - {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, - {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, - {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, - {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, - {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, - {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, - {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, - {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, - {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, - {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, - {file = "black-23.3.0-py3-none-any.whl", hash = 
"sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, - {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - [[package]] name = "boto3" -version = "1.26.16" +version = "1.34.11" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"}, - {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"}, + {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, + {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, ] [package.dependencies] -botocore = ">=1.29.16,<1.30.0" +botocore = ">=1.34.11,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.6.0,<0.7.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] @@ -702,22 +653,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.29.16" +version = "1.34.11" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"}, - {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"}, + {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, + {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = ">=1.25.4,<1.27" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, +] [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.19.19)"] [[package]] name = "botocore-stubs" @@ -1624,17 +1578,6 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] -[[package]] -name = "pathspec" -version = "0.9.0" -description = "Utility library for gitignore style pattern matching of file paths." 
-optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -files = [ - {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, - {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, -] - [[package]] name = "pbr" version = "5.9.0" @@ -1646,21 +1589,6 @@ files = [ {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, ] -[[package]] -name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -optional = false -python-versions = ">=3.7" -files = [ - {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, - {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, -] - -[package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] - [[package]] name = "pluggy" version = "1.0.0" @@ -1889,13 +1817,13 @@ files = [ [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] [package.dependencies] @@ -1907,7 +1835,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" @@ -2204,46 +2132,46 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.0.269" -description = "An extremely fast Python linter, written in Rust." +version = "0.1.11" +description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"}, - {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"}, - {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"}, - {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"}, - {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"}, - {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = 
"sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, + {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, + {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, + {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, + {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, ] [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, - {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, + {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, + {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, ] [package.dependencies] -botocore = ">=1.12.36,<2.0a.0" +botocore = ">=1.33.2,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] [[package]] name = "sarif-om" @@ -2493,16 +2421,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = 
"wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2740,4 +2658,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b" +content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482" diff --git a/pre-commit.py b/pre-commit.py index dc0b9ed588..c5ed63ac44 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def black(fix_inplace: bool) -> str: - cmd = "poetry run black" - if not fix_inplace: - cmd += " --diff --check" +def ruff_check(fix_inplace: bool) -> str: + cmd = "poetry run ruff check" + if fix_inplace: + cmd += " --fix" return cmd -def ruff(fix_inplace: bool) -> str: - cmd = "poetry run ruff" - if fix_inplace: - cmd += " --fix" +def ruff_format(fix_inplace: bool) -> str: + cmd = "poetry run ruff format" + if not fix_inplace: + cmd += " --diff --check" return cmd @@ -109,16 +109,16 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="black", + name="ruff check", suffix=".py", - cmd=black(fix_inplace=args.fix_inplace), + cmd=ruff_check(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) check( - name="ruff", + name="ruff format", suffix=".py", - cmd=ruff(fix_inplace=args.fix_inplace), + cmd=ruff_format(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/pyproject.toml b/pyproject.toml index 401acaeba4..bb04123e05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" -pytest = "^7.3.1" +pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} @@ -17,7 +17,7 @@ aiopg = "^1.4.0" Jinja2 = "^3.0.2" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" -boto3 = "^1.26.16" +boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], version = "^4.1.2"} backoff = 
"^2.2.1" @@ -40,22 +40,13 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" [tool.poetry.group.dev.dependencies] -black = "^23.3.0" mypy = "==1.3.0" -ruff = "^0.0.269" +ruff = "^0.1.11" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -[tool.black] -line-length = 100 -extend-exclude = ''' -/( - vendor -)/ -''' - [tool.mypy] exclude = "^vendor/" check_untyped_defs = true @@ -82,7 +73,9 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" extend-exclude = ["vendor/"] -ignore = ["E501"] +ignore = [ + "E501", # Line too long, we don't want to be too strict about it +] select = [ "E", # pycodestyle "F", # Pyflakes @@ -90,3 +83,4 @@ select = [ "W", # pycodestyle "B", # bugbear ] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index fdae378d55..4d136472e0 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true +aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true @@ -39,3 +40,5 @@ tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" histogram = "0.7" + +futures.workspace = true diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 8fb1346c8e..d2842877d0 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -16,10 +16,12 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::retry::RetryConfig; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; -use aws_sdk_s3::config::Region; +use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; +use aws_smithy_async::rt::sleep::TokioSleep; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; @@ -283,9 +285,13 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ) }; + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let mut builder = Config::builder() .behavior_version(BehaviorVersion::v2023_11_09()) .region(bucket_region) + .retry_config(RetryConfig::adaptive().with_max_attempts(3)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) .credentials_provider(credentials_provider); if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index ef020edc2a..957213856b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,3 +1,4 @@ +use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; @@ -34,6 +35,8 @@ enum Command { ScanMetadata { #[arg(short, long, default_value_t = false)] json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, }, } @@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", 
summary.summary_string()); + Command::ScanMetadata { json, tenant_ids } => { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } - }, + } Command::FindGarbage { node_kind, depth, diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index bcc4d2e618..bfde8f0213 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -187,10 +187,17 @@ Timeline layer count: {6} } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. -pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { +pub async fn scan_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, +) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; - let tenants = stream_tenants(&s3_client, &target); + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4015c27933..364cad7892 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -54,6 +54,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7aadd67ac6..591bfea182 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -66,12 +66,10 @@ impl FileStorage { /// Create file storage for a new timeline, but don't persist it yet. 
pub fn create_new( - ttid: &TenantTimelineId, + timeline_dir: Utf8PathBuf, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - let store = FileStorage { timeline_dir, conf: conf.clone(), @@ -277,7 +275,8 @@ mod test { .await .expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(ttid, conf, state.clone())?; + let timeline_dir = conf.timeline_dir(ttid); + let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs new file mode 100644 index 0000000000..ef88eb27e3 --- /dev/null +++ b/safekeeper/src/copy_timeline.rs @@ -0,0 +1,250 @@ +use std::sync::Arc; + +use anyhow::{bail, Result}; +use camino::Utf8PathBuf; + +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use tokio::{ + fs::OpenOptions, + io::{AsyncSeekExt, AsyncWriteExt}, +}; +use tracing::{info, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::{ + control_file::{FileStorage, Storage}, + pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, + safekeeper::SafeKeeperState, + timeline::{Timeline, TimelineError}, + wal_backup::copy_s3_segments, + wal_storage::{wal_file_paths, WalReader}, + GlobalTimelines, SafeKeeperConf, +}; + +// we don't want to have more than 10 segments on disk after copy, because they take space +const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; + +pub struct Request { + pub source: Arc, + pub until_lsn: Lsn, + pub destination_ttid: TenantTimelineId, +} + +pub async fn handle_request(request: Request) -> Result<()> { + // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( + // if LSN will point to the middle of a WAL record, timeline will be in "broken" state + + match GlobalTimelines::get(request.destination_ttid) { + // timeline already exists. would be good to check that this timeline is the copy + // of the source timeline, but it isn't obvious how to do that + Ok(_) => return Ok(()), + // timeline not found, we are going to create it + Err(TimelineError::NotFound(_)) => {} + // error, probably timeline was deleted + res => { + res?; + } + } + + let conf = &GlobalTimelines::get_global_config(); + let ttid = request.destination_ttid; + + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; + + let (mem_state, state) = request.source.get_state().await; + let start_lsn = state.timeline_start_lsn; + if start_lsn == Lsn::INVALID { + bail!("timeline is not initialized"); + } + let backup_lsn = mem_state.backup_lsn; + + { + let commit_lsn = mem_state.commit_lsn; + let flush_lsn = request.source.get_flush_lsn().await; + + info!( + "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", + start_lsn, backup_lsn, commit_lsn, flush_lsn + ); + + assert!(backup_lsn >= start_lsn); + assert!(commit_lsn >= start_lsn); + assert!(flush_lsn >= start_lsn); + + if request.until_lsn > flush_lsn { + bail!("requested LSN is beyond the end of the timeline"); + } + if request.until_lsn < start_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + if request.until_lsn > commit_lsn { + warn!("copy_timeline WAL is not fully committed"); + } + + if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG { + // we have a lot of segments that are not backed up. 
we can try to wait here until + // segments will be backed up to remote storage, but it's not clear how long to wait + bail!("too many segments are not backed up"); + } + } + + let wal_seg_size = state.server.wal_seg_size as usize; + if wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = request.until_lsn.segment_number(wal_seg_size); + + let new_backup_lsn = { + // we can't have new backup_lsn greater than existing backup_lsn or start of the last segment + let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64)); + + if max_backup_lsn <= start_lsn { + // probably we are starting from the first segment, which was not backed up yet. + // note that start_lsn can be in the middle of the segment + start_lsn + } else { + // we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up + assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0); + max_backup_lsn + } + }; + + // all previous segments will be copied inside S3 + let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size); + assert!(first_ondisk_segment <= last_segment); + assert!(first_ondisk_segment >= first_segment); + + copy_s3_segments( + wal_seg_size, + &request.source.ttid, + &request.destination_ttid, + first_segment, + first_ondisk_segment, + ) + .await?; + + copy_disk_segments( + conf, + &state, + wal_seg_size, + &request.source.ttid, + new_backup_lsn, + request.until_lsn, + &tli_dir_path, + ) + .await?; + + let mut new_state = SafeKeeperState::new( + &request.destination_ttid, + state.server.clone(), + vec![], + request.until_lsn, + start_lsn, + ); + new_state.timeline_start_lsn = start_lsn; + new_state.peer_horizon_lsn = request.until_lsn; + new_state.backup_lsn = new_backup_lsn; + + let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?; + file_storage.persist(&new_state).await?; + + // now we have a ready timeline in a temp directory + validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + + Ok(()) +} + +async fn copy_disk_segments( + conf: &SafeKeeperConf, + persisted_state: &SafeKeeperState, + wal_seg_size: usize, + source_ttid: &TenantTimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + tli_dir_path: &Utf8PathBuf, +) -> Result<()> { + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + conf.timeline_dir(source_ttid), + persisted_state, + start_lsn, + true, + )?; + + let mut buf = [0u8; MAX_SEND_SIZE]; + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = end_lsn.segment_number(wal_seg_size); + + for segment in first_segment..=last_segment { + let segment_start = segment * wal_seg_size as u64; + let segment_end = segment_start + wal_seg_size as u64; + + let copy_start = segment_start.max(start_lsn.0); + let copy_end = segment_end.min(end_lsn.0); + + let copy_start = copy_start - segment_start; + let copy_end = copy_end - segment_start; + + let wal_file_path = { + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + + if segment == last_segment { + partial + } else { + normal + } + }; + + write_segment( + &mut buf, + &wal_file_path, + wal_seg_size as u64, + copy_start, + copy_end, + &mut wal_reader, + ) + .await?; + } + + Ok(()) +} + +async fn write_segment( + buf: &mut [u8], + file_path: &Utf8PathBuf, + wal_seg_size: u64, + from: u64, + to: u64, + reader: &mut 
WalReader, +) -> Result<()> { + assert!(from <= to); + assert!(to <= wal_seg_size); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .open(&file_path) + .await?; + + // maybe fill with zeros, as in wal_storage.rs? + file.set_len(wal_seg_size).await?; + file.seek(std::io::SeekFrom::Start(from)).await?; + + let mut bytes_left = to - from; + while bytes_left > 0 { + let len = bytes_left as usize; + let len = len.min(buf.len()); + let len = reader.read(&mut buf[..len]).await?; + file.write_all(&buf[..len]).await?; + bytes_left -= len as u64; + } + + file.flush().await?; + file.sync_all().await?; + Ok(()) +} diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index daf9255ecb..c9ff1afdea 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -7,13 +7,16 @@ use std::io::Read; use std::path::PathBuf; use std::sync::Arc; +use anyhow::bail; use anyhow::Result; use camino::Utf8Path; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; +use postgres_ffi::MAX_SEND_SIZE; use serde::Deserialize; use serde::Serialize; +use sha2::{Digest, Sha256}; use utils::id::NodeId; use utils::id::TenantTimelineId; use utils::id::{TenantId, TimelineId}; @@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory; use crate::SafeKeeperConf; use crate::send_wal::WalSenderState; +use crate::wal_storage::WalReader; use crate::GlobalTimelines; /// Various filters that influence the resulting JSON output. @@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config { wal_backup_enabled: config.wal_backup_enabled, } } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineDigestRequest { + pub from_lsn: Lsn, + pub until_lsn: Lsn, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineDigest { + pub sha256: String, +} + +pub async fn calculate_digest( + tli: &Arc, + request: TimelineDigestRequest, +) -> Result { + if request.from_lsn > request.until_lsn { + bail!("from_lsn is greater than until_lsn"); + } + + let conf = GlobalTimelines::get_global_config(); + let (_, persisted_state) = tli.get_state().await; + + if persisted_state.timeline_start_lsn > request.from_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + tli.timeline_dir.clone(), + &persisted_state, + request.from_lsn, + true, + )?; + + let mut hasher = Sha256::new(); + let mut buf = [0u8; MAX_SEND_SIZE]; + + let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize; + while bytes_left > 0 { + let bytes_to_read = std::cmp::min(buf.len(), bytes_left); + let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?; + if bytes_read == 0 { + bail!("wal_reader.read returned 0 bytes"); + } + hasher.update(&buf[..bytes_read]); + bytes_left -= bytes_read; + } + + let digest = hasher.finalize(); + let digest = hex::encode(digest); + Ok(TimelineDigest { sha256: digest }) +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 25a3334e63..5283ea19c1 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::SkTimelineInfo; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; @@ -14,19 +14,21 @@ use tokio::fs::File; use tokio::io::AsyncReadExt; use 
tokio_util::sync::CancellationToken; use utils::failpoint_support::failpoints_handler; +use utils::http::request::parse_query_param; use std::io::Write as _; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tracing::info_span; +use tracing::{info_span, Instrument}; use utils::http::endpoint::{request_span, ChannelWriter}; +use crate::debug_dump::TimelineDigestRequest; use crate::receive_wal::WalReceiverState; use crate::safekeeper::Term; use crate::safekeeper::{ServerInfo, TermLsn}; use crate::send_wal::WalSenderState; use crate::timeline::PeerInfo; -use crate::{debug_dump, pull_timeline}; +use crate::{copy_timeline, debug_dump, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -204,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + + let request_data: TimelineCopyRequest = json_request(&mut request).await?; + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "source_timeline_id")?, + ); + + let source = GlobalTimelines::get(ttid)?; + + copy_timeline::handle_request(copy_timeline::Request{ + source, + until_lsn: request_data.until_lsn, + destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id), + }) + .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn)) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +async fn timeline_digest_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let from_lsn: Option = parse_query_param(&request, "from_lsn")?; + let until_lsn: Option = parse_query_param(&request, "until_lsn")?; + + let request = TimelineDigestRequest { + from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "from_lsn is required" + )))?, + until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "until_lsn is required" + )))?, + }; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = debug_dump::calculate_digest(&tli, request) + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, response) +} + /// Download a file from the timeline directory. 
// TODO: figure out a better way to copy files between safekeepers async fn timeline_files_handler(request: Request) -> Result, ApiError> { @@ -472,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", |r| request_span(r, timeline_files_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", + |r| request_span(r, timeline_copy_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) } #[cfg(test)] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 3a086f1f54..fc5f99eb00 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -16,6 +16,7 @@ mod auth; pub mod broker; pub mod control_file; pub mod control_file_upgrade; +pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index ad3a18a536..93b51f32c0 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,16 +1,24 @@ +use std::sync::Arc; + +use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use anyhow::{bail, Context, Result}; use tokio::io::AsyncWriteExt; use tracing::info; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; use crate::{ control_file, debug_dump, http::routes::TimelineStatus, + timeline::{Timeline, TimelineError}, wal_storage::{self, Storage}, - GlobalTimelines, + GlobalTimelines, SafeKeeperConf, }; /// Info about timeline on safekeeper ready for reporting. @@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result { async fn pull_timeline(status: TimelineStatus, host: String) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( - "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", + "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", ttid, host, status.commit_lsn, @@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result if dump.timelines.len() != 1 { bail!( - "Expected to fetch single timeline, got {} timelines", + "expected to fetch single timeline, got {} timelines", dump.timelines.len() ); } let timeline = dump.timelines.into_iter().next().unwrap(); let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "Timeline {} doesn't have disk content", + "timeline {} doesn't have disk content", ttid ))?; @@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result filenames.insert(0, "safekeeper.control".to_string()); info!( - "Downloading {} files from safekeeper {}", + "downloading {} files from safekeeper {}", filenames.len(), host ); - // Creating temp directory for a new timeline. It needs to be - // located on the same filesystem as the rest of the timelines. - - // conf.workdir is usually /storage/safekeeper/data - // will try to transform it into /storage/safekeeper/tmp - let temp_base = conf - .workdir - .parent() - .ok_or(anyhow::anyhow!("workdir has no parent"))? 
- .join("tmp"); - - tokio::fs::create_dir_all(&temp_base).await?; - - let tli_dir = camino_tempfile::Builder::new() - .suffix("_temptli") - .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) - .tempdir_in(temp_base)?; - let tli_dir_path = tli_dir.path().to_path_buf(); + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; // Note: some time happens between fetching list of files and fetching files themselves. // It's possible that some files will be removed from safekeeper and we will fail to fetch them. @@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result // TODO: fsync? // Let's create timeline from temp directory and verify that it's correct + let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; + info!( + "finished downloading timeline {}, commit_lsn={}, flush_lsn={}", + ttid, commit_lsn, flush_lsn + ); + assert!(status.commit_lsn <= status.flush_lsn); - let control_path = tli_dir_path.join("safekeeper.control"); + // Finally, load the timeline. + let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?; + + Ok(Response { + safekeeper_host: host, + }) +} + +/// Create temp directory for a new timeline. It needs to be located on the same +/// filesystem as the rest of the timelines. It will be automatically deleted when +/// Utf8TempDir goes out of scope. +pub async fn create_temp_timeline_dir( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, +) -> Result<(Utf8TempDir, Utf8PathBuf)> { + // conf.workdir is usually /storage/safekeeper/data + // will try to transform it into /storage/safekeeper/tmp + let temp_base = conf + .workdir + .parent() + .ok_or(anyhow::anyhow!("workdir has no parent"))? + .join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = camino_tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + + let tli_dir_path = tli_dir.path().to_path_buf(); + + Ok((tli_dir, tli_dir_path)) +} + +/// Do basic validation of a temp timeline, before moving it to the global map. +pub async fn validate_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + path: &Utf8PathBuf, +) -> Result<(Lsn, Lsn)> { + let control_path = path.join("safekeeper.control"); let control_store = control_file::FileStorage::load_control_file(control_path)?; if control_store.server.wal_seg_size == 0 { bail!("wal_seg_size is not set"); } - let wal_store = - wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?; + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; - let commit_lsn = status.commit_lsn; + let commit_lsn = control_store.commit_lsn; let flush_lsn = wal_store.flush_lsn(); - info!( - "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}", - ttid, commit_lsn, flush_lsn - ); - assert!(status.commit_lsn <= status.flush_lsn); + Ok((commit_lsn, flush_lsn)) +} + +/// Move timeline from a temp directory to the main storage, and load it to the global map. +/// This operation is done under a lock to prevent bugs if several concurrent requests are +/// trying to load the same timeline. Note that it doesn't guard against creating the +/// timeline with the same ttid, but no one should be doing this anyway. 
+pub async fn load_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, +) -> Result> { + // Take a lock to prevent concurrent loadings + let load_lock = GlobalTimelines::loading_lock().await; + let guard = load_lock.lock().await; + + if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) { + bail!("timeline already exists, cannot overwrite it") + } // Move timeline dir to the correct location let timeline_path = conf.timeline_dir(&ttid); info!( - "Moving timeline {} from {} to {}", - ttid, tli_dir_path, timeline_path + "moving timeline {} from {} to {}", + ttid, tmp_path, timeline_path ); tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; - tokio::fs::rename(tli_dir_path, &timeline_path).await?; + tokio::fs::rename(tmp_path, &timeline_path).await?; - let tli = GlobalTimelines::load_timeline(ttid) + let tli = GlobalTimelines::load_timeline(&guard, ttid) .await .context("Failed to load timeline after copy")?; info!( - "Loaded timeline {}, flush_lsn={}", + "loaded timeline {}, flush_lsn={}", ttid, tli.get_flush_lsn().await ); - Ok(Response { - safekeeper_host: host, - }) + Ok(tli) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index bdc9088138..2f284abe8c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -141,7 +141,8 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let timeline_dir = conf.timeline_dir(ttid); + let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index cbb3342e40..92ac5ba66d 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -21,8 +21,12 @@ struct GlobalTimelinesState { timelines: HashMap>, wal_backup_launcher_tx: Option>, conf: Option, + load_lock: Arc>, } +// Used to prevent concurrent timeline loading. +pub struct TimelineLoadLock; + impl GlobalTimelinesState { /// Get configuration, which must be set once during init. fn get_conf(&self) -> &SafeKeeperConf { @@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { timelines: HashMap::new(), wal_backup_launcher_tx: None, conf: None, + load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -174,8 +179,16 @@ impl GlobalTimelines { Ok(()) } + /// Take a lock for timeline loading. + pub async fn loading_lock() -> Arc> { + TIMELINES_STATE.lock().unwrap().load_lock.clone() + } + /// Load timeline from disk to the memory. 
- pub async fn load_timeline(ttid: TenantTimelineId) -> Result> { + pub async fn load_timeline<'a>( + _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, + ttid: TenantTimelineId, + ) -> Result> { let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c99bbc7d61..e4499eaf50 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -7,7 +7,7 @@ use tokio::task::JoinHandle; use utils::id::NodeId; use std::cmp::min; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -531,3 +531,62 @@ pub async fn read_object( Ok(Box::pin(reader)) } + +/// Copy segments from one timeline to another. Used in copy_timeline. +pub async fn copy_s3_segments( + wal_seg_size: usize, + src_ttid: &TenantTimelineId, + dst_ttid: &TenantTimelineId, + from_segment: XLogSegNo, + to_segment: XLogSegNo, +) -> Result<()> { + const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; + + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); + + let relative_dst_path = + Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); + + let remote_path = RemotePath::new(&relative_dst_path)?; + + let files = storage.list_files(Some(&remote_path)).await?; + let uploaded_segments = &files + .iter() + .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .collect::>(); + + debug!( + "these segments have already been uploaded: {:?}", + uploaded_segments + ); + + let relative_src_path = + Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); + + for segno in from_segment..to_segment { + if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { + info!("copied all segments from {} until {}", from_segment, segno); + } + + let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size); + if uploaded_segments.contains(&segment_name) { + continue; + } + debug!("copying segment {}", segment_name); + + let from = RemotePath::new(&relative_src_path.join(&segment_name))?; + let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + + storage.copy_object(&from, &to).await?; + } + + info!( + "finished copying segments from {} until {}", + from_segment, to_segment + ); + Ok(()) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e7538f805c..8d138c701f 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -728,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { } /// Helper returning full path to WAL segment file and its .partial brother. -fn wal_file_paths( +pub fn wal_file_paths( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index ff584bd4b0..980f343047 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: If those files already exist, we will overwrite them. Returns basepath for files with captured output. 
""" - assert type(cmd) is list + assert isinstance(cmd, list) base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" diff --git a/scripts/reformat b/scripts/reformat index 8688044f66..3533c4dcb8 100755 --- a/scripts/reformat +++ b/scripts/reformat @@ -6,5 +6,5 @@ set -euox pipefail echo 'Reformatting Rust code' cargo fmt echo 'Reformatting Python code' -poetry run ruff --fix test_runner scripts -poetry run black test_runner scripts +poetry run ruff check --fix test_runner scripts +poetry run ruff format test_runner scripts diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b1a8ba27d..001d4e23a9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1101,8 +1101,8 @@ class AbstractNeonCli(abc.ABC): If `local_binpath` is true, then we are invoking a test utility """ - assert type(arguments) == list - assert type(self.COMMAND) == str + assert isinstance(arguments, list) + assert isinstance(self.COMMAND, str) if local_binpath: # Test utility @@ -3032,6 +3032,28 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def timeline_create( self, tenant_id: TenantId, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index add6c4288a..a779dcc436 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) + def tenant_secondary_download(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + self.verbose_error(res) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res, dict) assert TenantId(res["id"]) == tenant_id size = res["size"] - assert type(size) == int + assert isinstance(size, int) inputs = res["inputs"] - assert type(inputs) is dict + assert isinstance(inputs, dict) return (size, inputs) def tenant_size_debug(self, tenant_id: TenantId) -> str: @@ -714,7 +718,7 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) - assert res.status_code == 200 + assert res.status_code in (200, 304) def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 1e6e9a0174..8a9509ea44 100644 --- a/test_runner/performance/test_perf_olap.py +++ 
b/test_runner/performance/test_perf_olap.py @@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports # on https://benchmark.clickhouse.com/ (the DB size may differ). +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off QUERIES: Tuple[LabelledQuery, ...] = ( - # Disable `black` formatting for the list of queries so that it's easier to read - # fmt: off ### ClickBench queries: LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"), LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"), @@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # LabelledQuery("NQ0", r"..."), # LabelledQuery("NQ1", r"..."), # ... - # fmt: on ) +# fmt: on EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 3cb4b667ff..7eb244d378 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare: else: assert ( len(x) == 2 - ), f"request param ({request.param}) should have a format of \ - `neon_{{safekeepers_enable_fsync}}`" + ), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`" # `NeonCompare` interface neon_env_builder = request.getfixturevalue("neon_env_builder") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 32397bbcc1..ed389b1aa2 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv): assert set(our_tenant_config.effective_config.keys()) == set( fully_custom_config.keys() ), "ensure we cover all config options" - assert { - k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] - for k in fully_custom_config.keys() - } == { - k: True for k in fully_custom_config.keys() - }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + assert ( + { + k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] + for k in fully_custom_config.keys() + } + == {k: True for k in fully_custom_config.keys()} + ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" ps_http.tenant_detach(tenant_id) env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5a9c2782e6..f9d6d0a934 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -186,9 +186,7 @@ def test_backward_compatibility( else: raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" @check_ondisk_data_compatibility_if_enabled @@ -247,9 +245,7 @@ def test_forward_compatibility( else: 
raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 7ec901af34..01ecc2b95f 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -2,7 +2,6 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft - # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 2cd2406065..efba2033fb 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -102,9 +102,7 @@ def test_basic_eviction( ), f"Did not expect to find {local_layer} layer after evicting" empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) - assert ( - not empty_layers - ), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" + assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) assert ( diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 340188c1ae..999e077e45 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 573d2139ce..e29db1e252 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -145,8 +145,7 @@ def expect_updated_msg_lsn( last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn - ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ - compared to the previous message's LSN {prev_msg_lsn}" + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}" return last_msg_lsn diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ae4297983..a9eff99a0c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,9 +1,11 @@ import random +from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures 
import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until @@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): flush_ms=5000, ) + # Encourage the new location to download while still in secondary mode + pageserver_b.http_client().tenant_secondary_download(tenant_id) + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Writes and reads still work in AttachedStale. workload.validate(pageserver_a.id) - # TODO: call into secondary mode API hooks to do an upload/download sync - # Generate some more dirty writes: we expect the origin to ingest WAL in # in AttachedStale workload.churn_rows(64, pageserver_a.id, upload=False) @@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first validate_heatmap(heatmap_second) + + +def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. + """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) + ) + + +def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): + """ + Test the overall data flow in secondary mode: + - Heatmap uploads from the attached location + - Heatmap & layer downloads from the secondary location + - Eviction of layers on the attached location results in deletion + on the secondary location as well. 
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_attached = env.pageservers[0] + ps_secondary = env.pageservers[1] + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, ps_attached.id) + + # Configure a secondary location + log.info("Setting up secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) + log.info(f"Read back conf: {readback_conf}") + + # Explicit upload/download cycle + # ============================== + log.info("Synchronizing after initial write...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Make changes on attached pageserver, check secondary downloads them + # =================================================================== + log.info("Synchronizing after subsequent write...") + workload.churn_rows(128, ps_attached.id) + + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while + # walreceiver is still doing something. + import time + + time.sleep(5) + + # Do evictions on attached pageserver, check secondary follows along + # ================================================================== + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Scrub the remote storage + # ======================== + # This confirms that the scrubber isn't upset by the presence of the heatmap + S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + + # Detach secondary and delete tenant + # =================================== + # This confirms that the heatmap gets cleaned up as well as other normal content. 
+ log.info("Detaching secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + }, + ) + + log.info("Deleting tenant...") + tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0dcbb23ad4..c6dbc77885 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. @@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" def test_detach_while_attaching( @@ -817,9 +815,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( if found_broken: break time.sleep(0.5) - assert ( - found_broken - ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" env.pageserver.tenant_load(env.initial_tenant) @@ -837,6 +833,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( break time.sleep(0.5) - assert ( - found_active - ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index dcd7232b1b..1887bca23b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -161,12 +161,10 @@ def switch_pg_to_new_pageserver( files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" assert ( len(files_before_detach) >= 2 - ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 07fb6dc5ca..6f05d7f7cb 100644 --- 
a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -201,8 +201,8 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" restored_timeline = restored_timelines[0] - assert restored_timeline["timeline_id"] == str( - timeline_id + assert ( + restored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Check that we had to retry the downloads @@ -280,8 +280,8 @@ def test_tenant_redownloads_truncated_file_on_startup( len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" retored_timeline = restored_timelines[0] - assert retored_timeline["timeline_id"] == str( - timeline_id + assert ( + retored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Request non-incremental logical size. Calculating it needs the layer file that diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cf8df389c8..b4ce633531 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - endpoint.stop_and_destroy() + endpoint.stop() timeline_delete_wait_completed(ps_http, tenant_id, timeline_id) # Also delete and manually create timeline on safekeepers -- this tests @@ -1838,3 +1838,83 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): assert final_stats.get("START_REPLICATION", 0) >= 1 # walproposer should connect to each safekeeper at least once assert final_stats.get("START_WAL_PUSH", 0) >= 3 + + +@pytest.mark.parametrize("insert_rows", [0, 100, 100000, 500000]) +def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): + target_percents = [10, 50, 90, 100] + + neon_env_builder.num_safekeepers = 3 + # we need remote storage that supports copy_object S3 API + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + + lsns = [] + + def remember_lsn(): + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + lsns.append(lsn) + return lsn + + # remember LSN right after timeline creation + lsn = remember_lsn() + log.info(f"LSN after timeline creation: {lsn}") + + endpoint.safe_psql("create table t(key int, value text)") + + timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id) + timeline_start_lsn = timeline_status.timeline_start_lsn + log.info(f"Timeline start LSN: {timeline_start_lsn}") + + current_percent = 0.0 + for new_percent in target_percents: + new_rows = insert_rows * (new_percent - current_percent) / 100 + current_percent = new_percent + + if new_rows == 0: + continue + + endpoint.safe_psql( + f"insert into t select generate_series(1, {new_rows}), repeat('payload!', 10)" + ) + + # remember LSN right after reaching new_percent + lsn = remember_lsn() + log.info(f"LSN 
after inserting {new_rows} rows: {lsn}") + + # TODO: would be also good to test cases where not all segments are uploaded to S3 + + for lsn in lsns: + new_timeline_id = TimelineId.generate() + log.info(f"Copying branch for LSN {lsn}, to timeline {new_timeline_id}") + + orig_digest = ( + env.safekeepers[0] + .http_client() + .timeline_digest(tenant_id, timeline_id, timeline_start_lsn, lsn) + ) + log.info(f"Original digest: {orig_digest}") + + for sk in env.safekeepers: + sk.http_client().copy_timeline( + tenant_id, + timeline_id, + { + "target_timeline_id": str(new_timeline_id), + "until_lsn": str(lsn), + }, + ) + + new_digest = sk.http_client().timeline_digest( + tenant_id, new_timeline_id, timeline_start_lsn, lsn + ) + log.info(f"Digest after timeline copy on safekeeper {sk.id}: {new_digest}") + + assert orig_digest == new_digest + + # TODO: test timelines can start after copy diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 68be0b3617..704e3721d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -167,22 +167,21 @@ build: | && apt-get update \ && apt-get install -y \ build-essential \ - curl \ + git \ libevent-dev \ - libssl-dev \ - patchutils \ + libtool \ pkg-config - ENV PGBOUNCER_VERSION 1.21.0 - ENV PGBOUNCER_GITPATH 1_21_0 + # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. + # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) + ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 RUN set -e \ - && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && cd pgbouncer-${PGBOUNCER_VERSION} \ - && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && cd pgbouncer \ + && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ - && make -j $(nproc) \ - && make install + && make -j $(nproc) dist_man_MANS= \ + && make install dist_man_MANS= merge: | # tweak nofile limits RUN set -e \ From d424f2b7c81b7448729e06917e6d328d91910992 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Jan 2024 09:36:22 +0000 Subject: [PATCH 109/412] empty commit so we can produce a merge commit From 7234208b36fea736ee1204c5a69a34db8160825e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 22 Jan 2024 09:14:30 +0000 Subject: [PATCH 110/412] bump shlex (#6421) ## Problem https://rustsec.org/advisories/RUSTSEC-2024-0006 ## Summary of changes `cargo update -p shlex` (cherry picked from commit 5559b169535b67850129173e694e5297a5a1a960) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75337481bb..952034a16b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5031,9 +5031,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" From 299d9474c9e3abc67297d81ad301ca46aaf97535 Mon Sep 17 00:00:00 2001 From: Anna Khanova 
<32508607+khanova@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:24:10 +0100 Subject: [PATCH 111/412] Proxy: fix gc (#6426) ## Problem Gc currently doesn't work properly. ## Summary of changes Change statement on running gc. --- proxy/src/cache/project_info.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 7af2118873..57d9e5289d 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -266,7 +266,7 @@ impl ProjectInfoCacheImpl { tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32); loop { interval.tick().await; - if self.cache.len() <= self.config.size { + if self.cache.len() < self.config.size { // If there are not too many entries, wait until the next gc cycle. continue; } From f0b2d4b0535fae693af1a7a18b2bb03e7ce243fb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 22 Jan 2024 15:27:29 +0100 Subject: [PATCH 112/412] fixup(#6037): actually fix the issue, #6388 failed to do so (#6429) Before this patch, the select! still retured immediately if `futs` was empty. Must have tested a stale build in my manual testing of #6388. (cherry picked from commit 15c0df4de7ee71b43526d9850b47c9107efe303e) --- pageserver/src/page_service.rs | 37 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1013131a99..77ce9981f0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -386,12 +386,18 @@ impl PageServerHandler { /// Future that completes when we need to shut down the connection. /// - /// Reasons for need to shut down are: - /// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// - task_mgr requests shutdown of the connection + /// We currently need to shut down when any of the following happens: + /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled + /// 2. task_mgr requests shutdown of the connection /// - /// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests` - /// where, at first, `shard_timelines` is empty, see + /// NB on (1): the connection's lifecycle is not actually tied to any of the + /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current + /// implementation to be responsive to timeline cancellation because + /// the connection holds their `GateGuards` open (sored in `shard_timelines`). + /// We currently do the easy thing and terminate the connection if any of the + /// shard_timelines gets cancelled. But really, we cuold spend more effort + /// and simply remove the cancelled timeline from the `shard_timelines`, thereby + /// dropping the guard. /// /// NB: keep in sync with [`Self::is_connection_cancelled`] async fn await_connection_cancelled(&self) { @@ -404,16 +410,17 @@ impl PageServerHandler { // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk // missing any inserts to the map. - let mut futs = self - .shard_timelines - .values() - .map(|ht| ht.timeline.cancel.cancelled()) - .collect::>(); - - tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { } - _ = futs.next() => {} - } + let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); + use futures::future::Either; + cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); + cancellation_sources.extend( + self.shard_timelines + .values() + .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), + ); + FuturesUnordered::from_iter(cancellation_sources) + .next() + .await; } /// Checking variant of [`Self::await_connection_cancelled`]. From 90e689addae554d7f79899379d3b84ff414404da Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Jan 2024 15:50:32 +0000 Subject: [PATCH 113/412] pageserver: mark tenant broken when cancelling attach (#6430) ## Problem When a tenant is in Attaching state, and waiting for the `concurrent_tenant_warmup` semaphore, it also listens for the tenant cancellation token. When that token fires, Tenant::attach drops out. Meanwhile, Tenant::set_stopping waits forever for the tenant to exit Attaching state. Fixes: https://github.com/neondatabase/neon/issues/6423 ## Summary of changes - In the absence of a valid state for the tenant, it is set to Broken in this path. A more elegant solution will require more refactoring, beyond this minimal fix. (cherry picked from commit 93572a3e99f572f51529b3fbb3b11dafa88f7f5c) --- pageserver/src/tenant.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ce99569beb..1d9b91c9ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -716,6 +716,10 @@ impl Tenant { // stayed in Activating for such a long time that shutdown found it in // that state. tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); + // Make the tenant broken so that set_stopping will not hang waiting for it to leave + // the Attaching state. This is an over-reaction (nothing really broke, the tenant is + // just shutting down), but ensures progress. + make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); return Ok(()); }, ) From d0cb4b88c8178a8373d6846c160ad32271435786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 5 Feb 2024 10:53:37 +0100 Subject: [PATCH 114/412] Don't preserve temp files on creation errors of delta layers (#6612) There is currently no cleanup done after a delta layer creation error, so delta layers can accumulate. The problem gets worse as the operation gets retried and delta layers accumulate on the disk. Therefore, delete them from disk (if something has been written to disk). --- pageserver/src/tenant/storage_layer/delta_layer.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ec031d6089..2a51884c0b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -609,7 +609,19 @@ impl DeltaLayerWriter { key_end: Key, timeline: &Arc, ) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end, timeline).await + let inner = self.inner.take().unwrap(); + let temp_path = inner.path.clone(); + let result = inner.finish(key_end, timeline).await; + // The delta layer files can sometimes be really large. Clean them up. 
+ if result.is_err() { + tracing::warn!( + "Cleaning up temporary delta file {temp_path} after error during writing" + ); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}") + } + } + result } } From 1cef395266bf7b439f1f14abe78d1ce8ef25d8ac Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:04:46 +0100 Subject: [PATCH 115/412] Proxy: copy bidirectional fork (#6720) ## Problem `tokio::io::copy_bidirectional` doesn't close the connection once one of the sides closes it. It's not really suitable for the postgres protocol. ## Summary of changes Fork `copy_bidirectional` and initiate a shutdown for both connections. --------- Co-authored-by: Conrad Ludgate --- proxy/src/proxy.rs | 1 + proxy/src/proxy/copy_bidirectional.rs | 256 ++++++++++++++++++++++++++ proxy/src/proxy/passthrough.rs | 2 +- 3 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 proxy/src/proxy/copy_bidirectional.rs diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 50e22ec72a..77aadb6f28 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -2,6 +2,7 @@ mod tests; pub mod connect_compute; +mod copy_bidirectional; pub mod handshake; pub mod passthrough; pub mod retry; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs new file mode 100644 index 0000000000..2ecc1151da --- /dev/null +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -0,0 +1,256 @@ +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +use std::future::poll_fn; +use std::io; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +enum TransferState { + Running(CopyBuffer), + ShuttingDown(u64), + Done(u64), +} + +fn transfer_one_direction( + cx: &mut Context<'_>, + state: &mut TransferState, + r: &mut A, + w: &mut B, +) -> Poll> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut r = Pin::new(r); + let mut w = Pin::new(w); + loop { + match state { + TransferState::Running(buf) => { + let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; + *state = TransferState::ShuttingDown(count); + } + TransferState::ShuttingDown(count) => { + ready!(w.as_mut().poll_shutdown(cx))?; + *state = TransferState::Done(*count); + } + TransferState::Done(count) => return Poll::Ready(Ok(*count)), + } + } +} + +pub(super) async fn copy_bidirectional( + a: &mut A, + b: &mut B, +) -> Result<(u64, u64), std::io::Error> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut a_to_b = TransferState::Running(CopyBuffer::new()); + let mut b_to_a = TransferState::Running(CopyBuffer::new()); + + poll_fn(|cx| { + let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + + // Early termination checks + if let TransferState::Done(_) = a_to_b { + if let TransferState::Running(buf) = &b_to_a { + // Initiate shutdown + b_to_a = TransferState::ShuttingDown(buf.amt); + b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + } + } + if let TransferState::Done(_) = b_to_a { + if let TransferState::Running(buf) = &a_to_b { + // Initiate shutdown + a_to_b = TransferState::ShuttingDown(buf.amt); + a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + } + } + + // It is not a problem if ready! returns early ... 
(comment remains the same) + let a_to_b = ready!(a_to_b_result); + let b_to_a = ready!(b_to_a_result); + + Poll::Ready(Ok((a_to_b, b_to_a))) + }) + .await +} + +#[derive(Debug)] +pub(super) struct CopyBuffer { + read_done: bool, + need_flush: bool, + pos: usize, + cap: usize, + amt: u64, + buf: Box<[u8]>, +} +const DEFAULT_BUF_SIZE: usize = 8 * 1024; + +impl CopyBuffer { + pub(super) fn new() -> Self { + Self { + read_done: false, + need_flush: false, + pos: 0, + cap: 0, + amt: 0, + buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), + } + } + + fn poll_fill_buf( + &mut self, + cx: &mut Context<'_>, + reader: Pin<&mut R>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + { + let me = &mut *self; + let mut buf = ReadBuf::new(&mut me.buf); + buf.set_filled(me.cap); + + let res = reader.poll_read(cx, &mut buf); + if let Poll::Ready(Ok(())) = res { + let filled_len = buf.filled().len(); + me.read_done = me.cap == filled_len; + me.cap = filled_len; + } + res + } + + fn poll_write_buf( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + let me = &mut *self; + match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { + Poll::Pending => { + // Top up the buffer towards full if we can read a bit more + // data - this should improve the chances of a large write + if !me.read_done && me.cap < me.buf.len() { + ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + } + Poll::Pending + } + res => res, + } + } + + pub(super) fn poll_copy( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + loop { + // If our buffer is empty, then we need to read some data to + // continue. + if self.pos == self.cap && !self.read_done { + self.pos = 0; + self.cap = 0; + + match self.poll_fill_buf(cx, reader.as_mut()) { + Poll::Ready(Ok(())) => (), + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx))?; + self.need_flush = false; + } + + return Poll::Pending; + } + } + } + + // If our buffer has some data, let's write it out! + while self.pos < self.cap { + let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; + if i == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::WriteZero, + "write zero byte into writer", + ))); + } else { + self.pos += i; + self.amt += i as u64; + self.need_flush = true; + } + } + + // If pos larger than cap, this loop will never stop. + // In particular, user's wrong poll_write implementation returning + // incorrect written length may lead to thread blocking. + debug_assert!( + self.pos <= self.cap, + "writer returned length larger than input slice" + ); + + // If we've written all the data and we've seen EOF, flush out the + // data and finish the transfer. 
+ if self.pos == self.cap && self.read_done { + ready!(writer.as_mut().poll_flush(cx))?; + return Poll::Ready(Ok(self.amt)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn test_early_termination_a_to_d() { + let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + a_mock.write_all(b"hello").await.unwrap(); + a_mock.shutdown().await.unwrap(); + d_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + + let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + + // Assert correct transferred amounts + let (a_to_d_count, d_to_a_count) = result; + assert_eq!(a_to_d_count, 5); // 'hello' was transferred + assert!(d_to_a_count <= 8); // response only partially transferred or not at all + } + + #[tokio::test] + async fn test_early_termination_d_to_a() { + let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + d_mock.write_all(b"hello").await.unwrap(); + d_mock.shutdown().await.unwrap(); + a_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + + let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + + // Assert correct transferred amounts + let (a_to_d_count, d_to_a_count) = result; + assert_eq!(d_to_a_count, 5); // 'hello' was transferred + assert!(a_to_d_count <= 8); // response only partially transferred or not at all + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index b7018c6fb5..c98f68d8d1 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -45,7 +45,7 @@ pub async fn proxy_pass( // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; + let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?; Ok(()) } From 30b295b017e086d1478e16f6d8cff878fe66171c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 Feb 2024 13:14:06 +0000 Subject: [PATCH 116/412] proxy: some more parquet data (#6711) ## Summary of changes add auth_method and database to the parquet logs --- proxy/src/auth/backend.rs | 8 ++-- proxy/src/auth/backend/classic.rs | 8 ++-- proxy/src/auth/backend/hacks.rs | 12 +++-- proxy/src/auth/backend/link.rs | 2 + proxy/src/auth/credentials.rs | 3 ++ proxy/src/auth/flow.rs | 17 ++++++- proxy/src/context.rs | 23 ++++++++- proxy/src/context/parquet.rs | 69 ++++++++++++++++----------- proxy/src/proxy/tests.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 9 +++- 10 files changed, 104 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index fa2782bee3..c9f21f1cf5 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -194,8 +194,7 @@ async fn auth_quirks( // We now expect to see a very specific payload in the place of password. 
let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) - .await?; + let res = hacks::password_hack_no_authentication(ctx, info, client).await?; ctx.set_endpoint_id(res.info.endpoint.clone()); tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); @@ -276,11 +275,12 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + return hacks::authenticate_cleartext(ctx, info, client, secret).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await + classic::authenticate(ctx, info, client, config, secret).await } impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 745dd75107..e855843bc3 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{PqStream, Stream}, }; @@ -12,10 +12,10 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, secret: AuthSecret, ) -> auth::Result> { let flow = AuthFlow::new(client); @@ -27,13 +27,11 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret); + let scram = auth::Scram(&secret, &mut *ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, async { - // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); flow.begin(scram).await.map_err(|error| { warn!(?error, "error sending scram acknowledgement"); diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index b6c1a92d3c..9f60b709d4 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use super::{ use crate::{ auth::{self, AuthFlow}, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{self, Stream}, }; @@ -16,15 +16,16 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. 
pub async fn authenticate_cleartext( + ctx: &mut RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, secret: AuthSecret, ) -> auth::Result> { warn!("cleartext auth flow override is enabled, proceeding"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(); let auth_outcome = AuthFlow::new(client) .begin(auth::CleartextPassword(secret)) @@ -47,14 +48,15 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( + ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, ) -> auth::Result>> { warn!("project not specified, resorting to the password hack auth flow"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index c71637dd1a..bf9ebf4c18 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -61,6 +61,8 @@ pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { + ctx.set_auth_method(crate::context::AuthMethod::Web); + // registering waiter can fail if we get unlucky with rng. // just try again. let (psql_session_id, waiter) = loop { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d32609e44c..d318b3be54 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -99,6 +99,9 @@ impl ComputeUserInfoMaybeEndpoint { // record the values if we have them ctx.set_application(params.get("application_name").map(SmolStr::from)); ctx.set_user(user.clone()); + if let Some(dbname) = params.get("database") { + ctx.set_dbname(dbname.into()); + } // Project name might be passed via PG's command-line options. let endpoint_option = params diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index c2783e236c..dce73138c6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -4,9 +4,11 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, console::AuthSecret, + context::RequestMonitoring, sasl, scram, stream::{PqStream, Stream}, }; +use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; @@ -23,7 +25,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -138,6 +140,11 @@ impl AuthFlow<'_, S, CleartextPassword> { impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. 
pub async fn authenticate(self) -> super::Result> { + let Scram(secret, ctx) = self.state; + + // pause the timer while we communicate with the client + let _paused = ctx.latency_timer.pause(); + // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -148,9 +155,15 @@ impl AuthFlow<'_, S, Scram<'_>> { return Err(super::AuthError::bad_auth_method(sasl.method)); } + match sasl.method { + SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => { + ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) + } + _ => {} + } info!("client chooses {}", sasl.method); - let secret = self.state.0; let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new( secret, diff --git a/proxy/src/context.rs b/proxy/src/context.rs index d2bf3f68d3..0cea53ae63 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -11,7 +11,7 @@ use crate::{ console::messages::MetricsAuxInfo, error::ErrorKind, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, EndpointId, ProjectId, RoleName, + BranchId, DbName, EndpointId, ProjectId, RoleName, }; pub mod parquet; @@ -34,9 +34,11 @@ pub struct RequestMonitoring { project: Option, branch: Option, endpoint_id: Option, + dbname: Option, user: Option, application: Option, error_kind: Option, + pub(crate) auth_method: Option, success: bool, // extra @@ -45,6 +47,15 @@ pub struct RequestMonitoring { pub latency_timer: LatencyTimer, } +#[derive(Clone, Debug)] +pub enum AuthMethod { + // aka link aka passwordless + Web, + ScramSha256, + ScramSha256Plus, + Cleartext, +} + impl RequestMonitoring { pub fn new( session_id: Uuid, @@ -62,9 +73,11 @@ impl RequestMonitoring { project: None, branch: None, endpoint_id: None, + dbname: None, user: None, application: None, error_kind: None, + auth_method: None, success: false, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -106,10 +119,18 @@ impl RequestMonitoring { self.application = app.or_else(|| self.application.clone()); } + pub fn set_dbname(&mut self, dbname: DbName) { + self.dbname = Some(dbname); + } + pub fn set_user(&mut self, user: RoleName) { self.user = Some(user); } + pub fn set_auth_method(&mut self, auth_method: AuthMethod) { + self.auth_method = Some(auth_method); + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { ERROR_BY_KIND .with_label_values(&[kind.to_metric_label()]) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 0fe46915bc..ad22829183 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -84,8 +84,10 @@ struct RequestData { username: Option, application_name: Option, endpoint_id: Option, + database: Option, project: Option, branch: Option, + auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -104,8 +106,15 @@ impl From for RequestData { username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), + database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), + auth_method: value.auth_method.as_ref().map(|x| match x { + super::AuthMethod::Web => "web", + 
super::AuthMethod::ScramSha256 => "scram_sha_256", + super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", + super::AuthMethod::Cleartext => "cleartext", + }), protocol: value.protocol, region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -431,8 +440,10 @@ mod tests { application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -505,15 +516,15 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) + (1313727, 3, 6000), + (1313720, 3, 6000), + (1313780, 3, 6000), + (1313737, 3, 6000), + (1313867, 3, 6000), + (1313709, 3, 6000), + (1313501, 3, 6000), + (1313737, 3, 6000), + (438118, 1, 2000) ], ); @@ -543,11 +554,11 @@ mod tests { assert_eq!( file_stats, [ - (1028637, 5, 10000), - (1031969, 5, 10000), - (1019900, 5, 10000), - (1020365, 5, 10000), - (1025010, 5, 10000) + (1219459, 5, 10000), + (1225609, 5, 10000), + (1227403, 5, 10000), + (1226765, 5, 10000), + (1218043, 5, 10000) ], ); @@ -579,11 +590,11 @@ mod tests { assert_eq!( file_stats, [ - (1210770, 6, 12000), - (1211036, 6, 12000), - (1210990, 6, 12000), - (1210861, 6, 12000), - (202073, 1, 2000) + (1205106, 5, 10000), + (1204837, 5, 10000), + (1205130, 5, 10000), + (1205118, 5, 10000), + (1205373, 5, 10000) ], ); @@ -608,15 +619,15 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) + (1313727, 3, 6000), + (1313720, 3, 6000), + (1313780, 3, 6000), + (1313737, 3, 6000), + (1313867, 3, 6000), + (1313709, 3, 6000), + (1313501, 3, 6000), + (1313737, 3, 6000), + (438118, 1, 2000) ], ); @@ -653,7 +664,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)], + [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)], ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3e961afb41..5bb43c0375 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -144,7 +144,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0)) + .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) .await? 
.authenticate() .await?; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 54424360c4..e9f868d51e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -36,6 +36,7 @@ use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; @@ -117,6 +118,9 @@ fn get_conn_info( headers: &HeaderMap, tls: &TlsConfig, ) -> Result { + // HTTP only uses cleartext (for now and likely always) + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let connection_string = headers .get("Neon-Connection-String") .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? @@ -134,7 +138,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?; + let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { @@ -174,7 +179,7 @@ fn get_conn_info( Ok(ConnInfo { user_info, - dbname: dbname.into(), + dbname, password: match password { std::borrow::Cow::Borrowed(b) => b.into(), std::borrow::Cow::Owned(b) => b.into(), From f1347f24173bd61c2c4469fca2336c52b0980746 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 Feb 2024 15:03:45 +0000 Subject: [PATCH 117/412] proxy: add more http logging (#6726) ## Problem hard to see where time is taken during HTTP flow. ## Summary of changes add a lot more for query state. add a conn_id field to the sql-over-http span --- proxy/src/metrics.rs | 5 ++-- proxy/src/serverless/backend.rs | 8 +++---- proxy/src/serverless/conn_pool.rs | 22 +++++------------- proxy/src/serverless/sql_over_http.rs | 33 +++++++++++++++++++++++---- 4 files changed, 41 insertions(+), 27 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index ccf89f9b05..f7f162a075 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -200,8 +200,9 @@ impl LatencyTimer { pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); + if let Some(start) = self.start.take() { + self.accumulated += start.elapsed(); + } // success self.outcome = "success"; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8285da68d7..156002006d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::Duration}; use async_trait::async_trait; -use tracing::info; +use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, @@ -15,7 +15,7 @@ use crate::{ proxy::connect_compute::ConnectMechanism, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, @@ -81,8 +81,8 @@ impl PoolingBackend { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - ctx.set_application(Some(APP_NAME)); + tracing::Span::current().record("conn_id", display(conn_id)); + info!("pool: opening a new 
connection '{conn_info}'"); let backend = self .config .auth_backend diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index f4e5b145c5..53e7c1c2ee 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -4,7 +4,6 @@ use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use smol_str::SmolStr; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, @@ -31,8 +30,6 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; -pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); - #[derive(Debug, Clone)] pub struct ConnInfo { pub user_info: ComputeUserInfo, @@ -379,12 +376,13 @@ impl GlobalConnPool { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(ctx.session_id)?; + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", &tracing::field::display(client.inner.get_process_id()), ); + info!("pool: reusing connection '{conn_info}'"); + client.session.send(ctx.session_id)?; ctx.latency_timer.pool_hit(); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); @@ -577,7 +575,6 @@ pub struct Client { } pub struct Discard<'a, C: ClientInnerExt> { - conn_id: uuid::Uuid, conn_info: &'a ConnInfo, pool: &'a mut Weak>>, } @@ -603,14 +600,7 @@ impl Client { span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); - ( - &mut inner.inner, - Discard { - pool, - conn_info, - conn_id: inner.conn_id, - }, - ) + (&mut inner.inner, Discard { pool, conn_info }) } pub fn check_idle(&mut self, status: ReadyForQueryStatus) { @@ -625,13 +615,13 @@ impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e9f868d51e..ecb72abe73 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -36,6 +36,7 @@ use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::serverless::backend::HttpConnError; use crate::DbName; use crate::RoleName; @@ -305,7 +306,14 @@ pub async fn handle( Ok(response) } -#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] +#[instrument( + name = "sql-over-http", + skip_all, + fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty + ) +)] async fn handle_inner( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, @@ -359,12 +367,10 @@ async fn 
handle_inner( let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); - let paused = ctx.latency_timer.pause(); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; - drop(paused); info!(request_content_length, "request size in bytes"); HTTP_CONTENT_LENGTH.observe(request_content_length as f64); @@ -380,15 +386,20 @@ async fn handle_inner( let body = hyper::body::to_bytes(request.into_body()) .await .map_err(anyhow::Error::from)?; + info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly }; let authenticate_and_connect = async { let keys = backend.authenticate(ctx, &conn_info).await?; - backend + let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) - .await + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.latency_timer.success(); + Ok::<_, HttpConnError>(client) }; // Run both operations in parallel @@ -420,6 +431,7 @@ async fn handle_inner( results } Payload::Batch(statements) => { + info!("starting transaction"); let (inner, mut discard) = client.inner(); let mut builder = inner.build_transaction(); if let Some(isolation_level) = txn_isolation_level { @@ -449,6 +461,7 @@ async fn handle_inner( .await { Ok(results) => { + info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool // TODO: get a query status from the error @@ -459,6 +472,7 @@ async fn handle_inner( results } Err(err) => { + info!("rollback"); let status = transaction.rollback().await.map_err(|e| { // if we cannot rollback - for now don't return connection to pool // TODO: get a query status from the error @@ -533,8 +547,10 @@ async fn query_to_json( raw_output: bool, default_array_mode: bool, ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { + info!("executing query"); let query_params = data.params; let row_stream = client.query_raw_txt(&data.query, query_params).await?; + info!("finished executing query"); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too @@ -569,6 +585,13 @@ async fn query_to_json( } .and_then(|s| s.parse::().ok()); + info!( + rows = rows.len(), + ?ready, + command_tag, + "finished reading rows" + ); + let mut fields = vec![]; let mut columns = vec![]; From 2227540a0d330dbafb885af83a6487903ae4f68e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 12 Feb 2024 19:41:02 +0100 Subject: [PATCH 118/412] Proxy refactor auth+connect (#6708) ## Problem Not really a problem, just refactoring. ## Summary of changes Separate authenticate from wake compute. Do not call wake compute second time if we managed to connect to postgres or if we got it not from cache. 
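For reference, the retry rule this refactor converges on can be summarised as a minimal, self-contained sketch. The types below (`ErrorKind`, `NodeInfo`, `should_rewake`) are simplified stand-ins for illustration, not the proxy's actual definitions; the decision itself mirrors the check added in `connect_compute.rs` in this patch.

```rust
// Sketch of the retry decision: after a failed connection attempt, wake_compute is
// re-invoked only when the node info came from the cache AND the failure happened
// before reaching Postgres; otherwise the existing node info is reused.

#[derive(Clone, Copy, PartialEq, Eq)]
enum ErrorKind {
    Compute,  // failed to reach the compute node
    Postgres, // reached Postgres, but it returned an error
}

struct NodeInfo {
    cached: bool,
}

// true  => call wake_compute again before the next attempt
// false => reuse the node info we already have
fn should_rewake(err: ErrorKind, node_info: &NodeInfo) -> bool {
    // A Postgres-level error means the address was good: the compute node answered.
    // Likewise, freshly fetched (non-cached) node info cannot be stale.
    !(err == ErrorKind::Postgres || !node_info.cached)
}

fn main() {
    let cached = NodeInfo { cached: true };
    let fresh = NodeInfo { cached: false };

    assert!(should_rewake(ErrorKind::Compute, &cached));    // cached address may be stale
    assert!(!should_rewake(ErrorKind::Postgres, &cached));  // we reached Postgres: keep it
    assert!(!should_rewake(ErrorKind::Compute, &fresh));    // just fetched: waking again won't help
}
```
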
--- proxy/src/auth.rs | 5 - proxy/src/auth/backend.rs | 146 ++++++++++++++++------------- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/hacks.rs | 6 +- proxy/src/bin/proxy.rs | 2 +- proxy/src/compute.rs | 8 +- proxy/src/config.rs | 2 +- proxy/src/console/provider.rs | 33 ++++++- proxy/src/console/provider/mock.rs | 4 +- proxy/src/error.rs | 12 ++- proxy/src/proxy.rs | 13 +-- proxy/src/proxy/connect_compute.rs | 67 ++++++++----- proxy/src/proxy/tests.rs | 142 +++++++++++++++++++++------- proxy/src/proxy/wake_compute.rs | 16 +--- proxy/src/serverless/backend.rs | 40 +++----- 15 files changed, 307 insertions(+), 191 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 48de4e2353..c8028d1bf0 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -36,9 +36,6 @@ pub enum AuthErrorImpl { #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), - #[error(transparent)] - WakeCompute(#[from] console::errors::WakeComputeError), - /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -119,7 +116,6 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), - WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), @@ -139,7 +135,6 @@ impl ReportableError for AuthError { match self.0.as_ref() { Link(e) => e.get_error_kind(), GetAuthInfo(e) => e.get_error_kind(), - WakeCompute(e) => e.get_error_kind(), Sasl(e) => e.get_error_kind(), AuthFailed(_) => crate::error::ErrorKind::User, BadAuthMethod(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index c9f21f1cf5..47c1dc4e92 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -10,9 +10,9 @@ use crate::auth::validate_password_and_exchange; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; -use crate::console::AuthSecret; +use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; -use crate::proxy::wake_compute::wake_compute; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; use crate::{ @@ -26,7 +26,6 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use futures::TryFutureExt; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -56,11 +55,11 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T> { +pub enum BackendType<'a, T, D> { /// Cloud API (V2). Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. 
- Link(MaybeOwned<'a, url::ApiUrl>), + Link(MaybeOwned<'a, url::ApiUrl>, D), } pub trait TestBackend: Send + Sync + 'static { @@ -71,7 +70,7 @@ pub trait TestBackend: Send + Sync + 'static { fn get_role_secret(&self) -> Result; } -impl std::fmt::Display for BackendType<'_, ()> { +impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use BackendType::*; match self { @@ -86,51 +85,50 @@ impl std::fmt::Display for BackendType<'_, ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } -impl BackendType<'_, T> { +impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub fn as_ref(&self) -> BackendType<'_, &T> { + pub fn as_ref(&self) -> BackendType<'_, &T, &D> { use BackendType::*; match self { Console(c, x) => Console(MaybeOwned::Borrowed(c), x), - Link(c) => Link(MaybeOwned::Borrowed(c)), + Link(c, x) => Link(MaybeOwned::Borrowed(c), x), } } } -impl<'a, T> BackendType<'a, T> { +impl<'a, T, D> BackendType<'a, T, D> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { use BackendType::*; match self { Console(c, x) => Console(c, f(x)), - Link(c) => Link(c), + Link(c, x) => Link(c, x), } } } - -impl<'a, T, E> BackendType<'a, Result> { +impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { Console(c, x) => x.map(|x| Console(c, x)), - Link(c) => Ok(Link(c)), + Link(c, x) => Ok(Link(c, x)), } } } -pub struct ComputeCredentials { +pub struct ComputeCredentials { pub info: ComputeUserInfo, - pub keys: T, + pub keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] @@ -153,7 +151,6 @@ impl ComputeUserInfo { } pub enum ComputeCredentialKeys { - #[cfg(any(test, feature = "testing"))] Password(Vec), AuthKeys(AuthKeys), } @@ -188,7 +185,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. 
@@ -198,8 +195,11 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); - - (res.info, Some(res.keys)) + let password = match res.keys { + ComputeCredentialKeys::Password(p) => p, + _ => unreachable!("password hack should return a password"), + }; + (res.info, Some(password)) } Ok(info) => (info, None), }; @@ -253,7 +253,7 @@ async fn authenticate_with_secret( unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { @@ -283,14 +283,14 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { +impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_) => Some("link".into()), + Link(_, _) => Some("link".into()), } } @@ -300,7 +300,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { match self { Console(_, user_info) => &user_info.user, - Link(_) => "link", + Link(_, _) => "link", } } @@ -312,7 +312,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { + ) -> auth::Result> { use BackendType::*; let res = match self { @@ -323,33 +323,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let compute_credentials = + let credentials = auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; - - let mut num_retries = 0; - let mut node = - wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - (node, BackendType::Console(api, compute_credentials.info)) + BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
- Link(url) => { + Link(url, _) => { info!("performing link authentication"); - let node_info = link::authenticate(ctx, &url, client).await?; + let info = link::authenticate(ctx, &url, client).await?; - ( - CachedNodeInfo::new_uncached(node_info), - BackendType::Link(url), - ) + BackendType::Link(url, info) } }; @@ -358,7 +342,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } } -impl BackendType<'_, ComputeUserInfo> { +impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, ctx: &mut RequestMonitoring, @@ -366,7 +350,7 @@ impl BackendType<'_, ComputeUserInfo> { use BackendType::*; match self { Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Link(_) => Ok(Cached::new_uncached(None)), + Link(_, _) => Ok(Cached::new_uncached(None)), } } @@ -377,21 +361,51 @@ impl BackendType<'_, ComputeUserInfo> { use BackendType::*; match self { Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - } - } - - /// When applicable, wake the compute node, gaining its connection info in the process. - /// The link auth flow doesn't support this, so we return [`None`] in that case. - pub async fn wake_compute( - &self, - ctx: &mut RequestMonitoring, - ) -> Result, console::errors::WakeComputeError> { - use BackendType::*; - - match self { - Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, - Link(_) => Ok(None), + Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, info) => Ok(Cached::new_uncached(info.clone())), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index e855843bc3..d075331846 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -17,7 +17,7 @@ pub(super) async fn authenticate( client: &mut PqStream>, config: &'static AuthenticationConfig, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { let flow = AuthFlow::new(client); let scram_keys = match secret { #[cfg(any(test, feature = "testing"))] diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 9f60b709d4..26cf7a01f2 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -20,7 +20,7 @@ pub async fn authenticate_cleartext( info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { warn!("cleartext auth flow override is 
enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -51,7 +51,7 @@ pub async fn password_hack_no_authentication( ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, -) -> auth::Result>> { +) -> auth::Result { warn!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -73,6 +73,6 @@ pub async fn password_hack_no_authentication( options: info.options, endpoint: payload.endpoint, }, - keys: payload.password, + keys: ComputeCredentialKeys::Password(payload.password), }) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 8fbcb56758..00a229c135 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -383,7 +383,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } AuthBackend::Link => { let url = args.uri.parse()?; - auth::BackendType::Link(MaybeOwned::Owned(url)) + auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; let http_config = HttpConfig { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 83940d80ec..b61c1fb9ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,7 +1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::errors::WakeComputeError, + console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, metrics::NUM_DB_CONNECTIONS_GAUGE, @@ -93,7 +93,7 @@ impl ConnCfg { } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: &Self) { + pub fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -253,6 +253,8 @@ pub struct PostgresConnection { pub params: std::collections::HashMap, /// Query cancellation token. pub cancel_closure: CancelClosure, + /// Labels for proxy's metrics. 
+ pub aux: MetricsAuxInfo, _guage: IntCounterPairGuard, } @@ -263,6 +265,7 @@ impl ConnCfg { &self, ctx: &mut RequestMonitoring, allow_self_signed_compute: bool, + aux: MetricsAuxInfo, timeout: Duration, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -297,6 +300,7 @@ impl ConnCfg { stream, params, cancel_closure, + aux, _guage: NUM_DB_CONNECTIONS_GAUGE .with_label_values(&[ctx.protocol]) .guard(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 31c9228b35..5fcb537834 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -13,7 +13,7 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, ()>, + pub auth_backend: auth::BackendType<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index e5cad42753..640444d14e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -4,7 +4,10 @@ pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::{backend::ComputeUserInfo, IpPattern}, + auth::{ + backend::{ComputeCredentialKeys, ComputeUserInfo}, + IpPattern, + }, cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, config::{CacheOptions, ProjectInfoCacheOptions}, @@ -261,6 +264,34 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } +impl NodeInfo { + pub async fn connect( + &self, + ctx: &mut RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + }; + } +} + pub type NodeInfoCache = TimedLru; pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 79a04f255d..0579ef6fc4 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -176,9 +176,7 @@ impl super::Api for Api { _ctx: &mut RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { - self.do_wake_compute() - .map_ok(CachedNodeInfo::new_uncached) - .await + self.do_wake_compute().map_ok(Cached::new_uncached).await } } diff --git a/proxy/src/error.rs b/proxy/src/error.rs index eafe92bf48..69fe1ebc12 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -29,7 +29,7 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... 
User, @@ -90,3 +90,13 @@ impl ReportableError for tokio::time::error::Elapsed { ErrorKind::RateLimit } } + +impl ReportableError for tokio_postgres::error::Error { + fn get_error_kind(&self) -> ErrorKind { + if self.as_db_error().is_some() { + ErrorKind::Postgres + } else { + ErrorKind::Compute + } + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 77aadb6f28..5f65de4c98 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -163,14 +163,14 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn allow_cleartext(&self) -> bool { + pub fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -287,7 +287,7 @@ pub async fn handle_client( } let user = user_info.get_user().to_owned(); - let (mut node_info, user_info) = match user_info + let user_info = match user_info .authenticate( ctx, &mut stream, @@ -306,14 +306,11 @@ pub async fn handle_client( } }; - node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config); - - let aux = node_info.aux.clone(); let mut node = connect_to_compute( ctx, &TcpMechanism { params: ¶ms }, - node_info, &user_info, + mode.allow_self_signed_compute(config), ) .or_else(|e| stream.throw_error(e)) .await?; @@ -330,8 +327,8 @@ pub async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, + aux: node.aux.clone(), compute: node, - aux, req: _request_gauge, conn: _client_gauge, })) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b9346aa743..6e57caf998 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,8 +1,9 @@ use crate::{ - auth, + auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError}, + console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, + error::ReportableError, metrics::NUM_CONNECTION_FAILURES, proxy::{ retry::{retry_after, ShouldRetry}, @@ -20,7 +21,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
#[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); @@ -31,13 +32,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg }; NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); - node_info.invalidate().config + node_info.invalidate() } #[async_trait] pub trait ConnectMechanism { type Connection; - type ConnectError; + type ConnectError: ReportableError; type Error: From; async fn connect_once( &self, @@ -49,6 +50,16 @@ pub trait ConnectMechanism { fn update_connect_config(&self, conf: &mut compute::ConnCfg); } +#[async_trait] +pub trait ComputeConnectBackend { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result; + + fn get_keys(&self) -> Option<&ComputeCredentialKeys>; +} + pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, @@ -67,11 +78,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await + node_info.connect(ctx, timeout).await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -82,16 +89,23 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. /// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub async fn connect_to_compute( ctx: &mut RequestMonitoring, mechanism: &M, - mut node_info: console::CachedNodeInfo, - user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + user_info: &B, + allow_self_signed_compute: bool, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let mut num_retries = 0; + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + if let Some(keys) = user_info.get_keys() { + node_info.set_keys(keys); + } + node_info.allow_self_signed_compute = allow_self_signed_compute; + // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); // try once @@ -108,28 +122,31 @@ where error!(error = ?err, "could not connect to compute node"); - let mut num_retries = 1; - - match user_info { - auth::BackendType::Console(api, info) => { + let node_info = + if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() { + // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error. + // Do not need to retrieve a new node_info, just return the old one. 
+ if !err.should_retry(num_retries) { + return Err(err.into()); + } + node_info + } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); - let config = invalidate_cache(node_info); - node_info = wake_compute(&mut num_retries, ctx, api, info).await?; + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - node_info.config.reuse_password(&config); mechanism.update_connect_config(&mut node_info.config); - } - // nothing to do? - auth::BackendType::Link(_) => {} - }; + node_info + }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet info!("wake_compute success. attempting to connect"); + num_retries = 1; loop { match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5bb43c0375..efbd661bbf 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,13 +2,19 @@ mod mitm; +use std::time::Duration; + use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend}; +use crate::auth::backend::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, +}; use crate::config::CertResolver; +use crate::console::caches::NodeInfoCache; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; @@ -369,12 +375,15 @@ enum ConnectAction { Connect, Retry, Fail, + RetryPg, + FailPg, } #[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, + cache: &'static NodeInfoCache, } impl TestConnectMechanism { @@ -393,6 +402,12 @@ impl TestConnectMechanism { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, + cache: Box::leak(Box::new(NodeInfoCache::new( + "test", + 1, + Duration::from_secs(100), + false, + ))), } } } @@ -403,6 +418,13 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + kind: crate::error::ErrorKind, +} + +impl ReportableError for TestConnectError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + self.kind + } } impl std::fmt::Display for TestConnectError { @@ -436,8 +458,22 @@ impl ConnectMechanism for TestConnectMechanism { *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), - ConnectAction::Retry => Err(TestConnectError { retryable: true }), - ConnectAction::Fail => Err(TestConnectError { retryable: false }), + ConnectAction::Retry => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::Fail => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Compute, + }), + ConnectAction::FailPg => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Postgres, + }), + ConnectAction::RetryPg => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Postgres, + }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -451,7 +487,7 @@ impl TestBackend for TestConnectMechanism { let 
action = self.sequence[*counter]; *counter += 1; match action { - ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console { status: http::StatusCode::FORBIDDEN, @@ -483,37 +519,41 @@ impl TestBackend for TestConnectMechanism { } } -fn helper_create_cached_node_info() -> CachedNodeInfo { +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), aux: Default::default(), allow_self_signed_compute: false, }; - CachedNodeInfo::new_uncached(node) + let (_, node) = cache.insert("key".into(), node); + node } fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) { - let cache = helper_create_cached_node_info(); +) -> auth::BackendType<'static, ComputeCredentials, &()> { let user_info = auth::BackendType::Console( MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), - ComputeUserInfo { - endpoint: "endpoint".into(), - user: "user".into(), - options: NeonOptions::parse_options_raw(""), + ComputeCredentials { + info: ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), + }, + keys: ComputeCredentialKeys::Password("password".into()), }, ); - (cache, user_info) + user_info } #[tokio::test] async fn connect_to_compute_success() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -521,24 +561,52 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); } +#[tokio::test] +async fn connect_to_compute_retry_pg() { + let _ = env_logger::try_init(); + use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) + .await + .unwrap(); + mechanism.verify(); +} + +#[tokio::test] +async fn connect_to_compute_fail_pg() { + let _ = env_logger::try_init(); + use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) + .await + .unwrap_err(); + mechanism.verify(); +} + /// Test that we don't retry if the error is not 
retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -547,11 +615,12 @@ async fn connect_to_compute_non_retry_1() { /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -560,15 +629,16 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. #[tokio::test] async fn connect_to_compute_non_retry_3() { + let _ = env_logger::try_init(); assert_eq!(NUM_RETRIES_CONNECT, 16); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, /* the 17th time */ Retry, + Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -577,11 +647,12 @@ async fn connect_to_compute_non_retry_3() { /// Should retry wake compute. #[tokio::test] async fn wake_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -590,11 +661,12 @@ async fn wake_retry() { /// Wake failed with a non-retryable error. 
#[tokio::test] async fn wake_non_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 925727bdab..2c593451b4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,9 +1,4 @@ -use crate::auth::backend::ComputeUserInfo; -use crate::console::{ - errors::WakeComputeError, - provider::{CachedNodeInfo, ConsoleBackend}, - Api, -}; +use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; use crate::proxy::retry::retry_after; @@ -11,17 +6,16 @@ use hyper::StatusCode; use std::ops::ControlFlow; use tracing::{error, warn}; +use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; -/// wake a compute (or retrieve an existing compute session from cache) -pub async fn wake_compute( +pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, - api: &ConsoleBackend, - info: &ComputeUserInfo, + api: &B, ) -> Result { loop { - let wake_res = api.wake_compute(ctx, info).await; + let wake_res = api.wake_compute(ctx).await; match handle_try_wake(wake_res, *num_retries) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 156002006d..6f93f86d5f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -4,7 +4,7 @@ use async_trait::async_trait; use tracing::{field::display, info}; use crate::{ - auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, + auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, config::ProxyConfig, console::{ @@ -27,7 +27,7 @@ impl PoolingBackend { &self, ctx: &mut RequestMonitoring, conn_info: &ConnInfo, - ) -> Result { + ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -49,13 +49,17 @@ impl PoolingBackend { }; let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; - match auth_outcome { + let res = match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) } - } + }; + res.map(|key| ComputeCredentials { + info: user_info, + keys: key, + }) } // Wake up the destination if needed. 
Code here is a bit involved because @@ -66,7 +70,7 @@ impl PoolingBackend { &self, ctx: &mut RequestMonitoring, conn_info: ConnInfo, - keys: ComputeCredentialKeys, + keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { let maybe_client = if !force_new { @@ -82,26 +86,8 @@ impl PoolingBackend { } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); - info!("pool: opening a new connection '{conn_info}'"); - let backend = self - .config - .auth_backend - .as_ref() - .map(|_| conn_info.user_info.clone()); - - let mut node_info = backend - .wake_compute(ctx) - .await? - .ok_or(HttpConnError::NoComputeInfo)?; - - match keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => node_info.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys), - }; - - ctx.set_project(node_info.aux.clone()); - + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self.config.auth_backend.as_ref().map(|_| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -109,8 +95,8 @@ impl PoolingBackend { conn_info, pool: self.pool.clone(), }, - node_info, &backend, + false, // do not allow self signed compute for http flow ) .await } @@ -129,8 +115,6 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), - #[error("wake_compute returned nothing")] - NoComputeInfo, } struct TokioMechanism { From c49c9707ced792aaa53d4a27bfe2719322704db4 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:58:58 +0100 Subject: [PATCH 119/412] Proxy: send cancel notifications to all instances (#6719) ## Problem If a cancel request ends up on the wrong proxy instance, it doesn't take effect. ## Summary of changes Send redis notifications to all proxy pods about the cancel request.
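For illustration only (this sketch is not part of the patch): a proxy that receives a `CancelRequest` for a key it does not hold publishes a `CancelSession` notification on a shared Redis channel; every proxy instance subscribes to that channel, and the pod that owns the key cancels the query locally without re-publishing. A condensed sketch of the publishing side, using the names introduced in the diff below (`CancelMap`, `Notification`, `CancelSession`, `PROXY_CHANNEL_NAME`) and omitting rate limiting, metrics and reconnection:

```rust
// Condensed, illustrative sketch of the publishing side -- not the actual
// implementation. CancelMap, Notification, CancelSession and PROXY_CHANNEL_NAME
// are the items introduced in the diff below; rate limiting, metrics and
// reconnection handling are omitted.
use pq_proto::CancelKeyData;
use redis::AsyncCommands;
use uuid::Uuid;

use crate::cancellation::CancelMap;
use crate::redis::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};

async fn cancel_or_publish(
    map: &CancelMap,
    publisher: &mut redis::aio::Connection,
    key: CancelKeyData,
    session_id: Uuid,
) -> anyhow::Result<()> {
    if let Some(cancel_closure) = map.get(&key).and_then(|x| x.clone()) {
        // The key was registered on this instance: cancel the query directly.
        cancel_closure.try_cancel_query().await?;
    } else {
        // Unknown key: fan the request out to all proxy pods via Redis pub/sub;
        // the pod that owns the key will cancel locally in its listener.
        let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
            region_id: None,
            cancel_key_data: key,
            session_id,
        }))?;
        let _: () = publisher.publish(PROXY_CHANNEL_NAME, payload).await?;
    }
    Ok(())
}
```

On the receiving side, the `MessageHandler` added in `proxy/src/redis/notifications.rs` routes `Notification::Cancel` to `cancel_session_no_publish`, so a cancellation is handled once and never re-published in a loop.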
Related issue: https://github.com/neondatabase/neon/issues/5839, https://github.com/neondatabase/cloud/issues/10262 --- Cargo.lock | 7 +- Cargo.toml | 2 +- libs/pq_proto/Cargo.toml | 1 + libs/pq_proto/src/lib.rs | 3 +- proxy/src/bin/proxy.rs | 32 ++++- proxy/src/cancellation.rs | 109 ++++++++++++++--- proxy/src/config.rs | 1 + proxy/src/metrics.rs | 9 ++ proxy/src/proxy.rs | 16 +-- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 38 ++++++ proxy/src/redis.rs | 1 + proxy/src/redis/notifications.rs | 197 +++++++++++++++++++++++------- proxy/src/redis/publisher.rs | 80 ++++++++++++ proxy/src/serverless.rs | 13 +- proxy/src/serverless/websocket.rs | 6 +- workspace_hack/Cargo.toml | 4 +- 17 files changed, 432 insertions(+), 89 deletions(-) create mode 100644 proxy/src/redis/publisher.rs diff --git a/Cargo.lock b/Cargo.lock index 83afdaf66f..f5f32e8491 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2247,11 +2247,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] @@ -3936,6 +3936,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "rand 0.8.5", + "serde", "thiserror", "tokio", "tracing", diff --git a/Cargo.toml b/Cargo.toml index ebc3dfa7b1..74dbaf1853 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,7 +80,7 @@ futures-core = "0.3" futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" -hashlink = "0.8.1" +hashlink = "0.8.4" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index b286eb0358..6eeb3bafef 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -13,5 +13,6 @@ rand.workspace = true tokio.workspace = true tracing.workspace = true thiserror.workspace = true +serde.workspace = true workspace_hack.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c52a21bcd3..522b65f5d1 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,6 +7,7 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use serde::{Deserialize, Serialize}; use std::{borrow::Cow, collections::HashMap, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs @@ -123,7 +124,7 @@ impl StartupMessageParams { } } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub struct CancelKeyData { pub backend_pid: i32, pub cancel_key: i32, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 00a229c135..b3d4fc0411 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,8 @@ use futures::future::Either; use proxy::auth; use proxy::auth::backend::MaybeOwned; +use proxy::cancellation::CancelMap; +use proxy::cancellation::CancellationHandler; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -12,6 +14,7 @@ use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::notifications; +use proxy::redis::publisher::RedisPublisherClient; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -22,6 +25,7 @@ use 
std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; use tokio::net::TcpListener; +use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; @@ -129,6 +133,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] endpoint_rps_limit: Vec, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + redis_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, @@ -225,6 +232,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + let cancel_map = CancelMap::default(); + let redis_publisher = match &args.redis_notifications { + Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + url, + args.region.clone(), + &config.redis_rps_limit, + )?))), + None => None, + }; + let cancellation_handler = Arc::new(CancellationHandler::new( + cancel_map.clone(), + redis_publisher, + )); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -234,6 +254,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); // TODO: rename the argument to something like serverless. @@ -248,6 +269,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); } @@ -271,7 +293,12 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(url) = args.redis_notifications { info!("Starting redis notifications listener ({url})"); - maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone())); + maintenance_tasks.spawn(notifications::task_main( + url.to_owned(), + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); } maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } @@ -403,6 +430,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let mut redis_rps_limit = args.redis_rps_limit.clone(); + RateBucketInfo::validate(&mut redis_rps_limit)?; let config = Box::leak(Box::new(ProxyConfig { tls_config, @@ -414,6 +443,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, endpoint_rps_limit, + redis_rps_limit, handshake_timeout: args.handshake_timeout, // TODO: add this argument region: args.region.clone(), diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index fe614628d8..93a77bc4ae 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,16 +1,28 @@ +use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; +use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use uuid::Uuid; -use crate::error::ReportableError; +use crate::{ + error::ReportableError, 
metrics::NUM_CANCELLATION_REQUESTS, + redis::publisher::RedisPublisherClient, +}; + +pub type CancelMap = Arc>>; /// Enables serving `CancelRequest`s. -#[derive(Default)] -pub struct CancelMap(DashMap>); +/// +/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler { + map: CancelMap, + redis_client: Option>>, +} #[derive(Debug, Error)] pub enum CancelError { @@ -32,15 +44,43 @@ impl ReportableError for CancelError { } } -impl CancelMap { +impl CancellationHandler { + pub fn new(map: CancelMap, redis_client: Option>>) -> Self { + Self { map, redis_client } + } /// Cancel a running query for the corresponding connection. - pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> { + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + let from = "from_client"; // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else { + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); + if let Some(redis_client) = &self.redis_client { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + info!("publishing cancellation key to Redis"); + match redis_client.lock().await.try_publish(key, session_id).await { + Ok(()) => { + info!("cancellation key successfuly published to Redis"); + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + } return Ok(()); }; - + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -57,7 +97,7 @@ impl CancelMap { // Random key collisions are unlikely to happen here, but they're still possible, // which is why we have to take care not to rewrite an existing key. 
- match self.0.entry(key) { + match self.map.entry(key) { dashmap::mapref::entry::Entry::Occupied(_) => continue, dashmap::mapref::entry::Entry::Vacant(e) => { e.insert(None); @@ -69,18 +109,46 @@ impl CancelMap { info!("registered new query cancellation key {key}"); Session { key, - cancel_map: self, + cancellation_handler: self, } } #[cfg(test)] fn contains(&self, session: &Session) -> bool { - self.0.contains_key(&session.key) + self.map.contains_key(&session.key) } #[cfg(test)] fn is_empty(&self) -> bool { - self.0.is_empty() + self.map.is_empty() + } +} + +#[async_trait] +pub trait NotificationsCancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +} + +#[async_trait] +impl NotificationsCancellationHandler for CancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { + let from = "from_redis"; + let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); + match cancel_closure { + Some(cancel_closure) => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); + cancel_closure.try_cancel_query().await + } + None => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + tracing::warn!("query cancellation key not found: {key}"); + Ok(()) + } + } } } @@ -115,7 +183,7 @@ pub struct Session { /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: Arc, + cancellation_handler: Arc, } impl Session { @@ -123,7 +191,9 @@ impl Session { /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); - self.cancel_map.0.insert(self.key, Some(cancel_closure)); + self.cancellation_handler + .map + .insert(self.key, Some(cancel_closure)); self.key } @@ -131,7 +201,7 @@ impl Session { impl Drop for Session { fn drop(&mut self) { - self.cancel_map.0.remove(&self.key); + self.cancellation_handler.map.remove(&self.key); info!("dropped query cancellation key {}", &self.key); } } @@ -142,13 +212,16 @@ mod tests { #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - let cancel_map: Arc = Default::default(); + let cancellation_handler = Arc::new(CancellationHandler { + map: CancelMap::default(), + redis_client: None, + }); - let session = cancel_map.clone().get_session(); - assert!(cancel_map.contains(&session)); + let session = cancellation_handler.clone().get_session(); + assert!(cancellation_handler.contains(&session)); drop(session); // Check that the session has been dropped. 
- assert!(cancel_map.is_empty()); + assert!(cancellation_handler.is_empty()); Ok(()) } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5fcb537834..9f276c3c24 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -21,6 +21,7 @@ pub struct ProxyConfig { pub require_client_ip: bool, pub disable_ip_check_for_http: bool, pub endpoint_rps_limit: Vec, + pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f7f162a075..66031f5eb2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -152,6 +152,15 @@ pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { .unwrap() }); +pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_cancellation_requests_total", + "Number of cancellation requests (per found/not_found).", + &["source", "kind"], + ) + .unwrap() +}); + #[derive(Clone)] pub struct LatencyTimer { // time since the stopwatch was started diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5f65de4c98..ce77098a5f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,7 +10,7 @@ pub mod wake_compute; use crate::{ auth, - cancellation::{self, CancelMap}, + cancellation::{self, CancellationHandler}, compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, @@ -62,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -72,7 +73,6 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancel_map = Arc::new(CancelMap::default()); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -80,7 +80,7 @@ pub async fn task_main( let (socket, peer_addr) = accept_result?; let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); + let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); let session_span = info_span!( @@ -113,7 +113,7 @@ pub async fn task_main( let res = handle_client( config, &mut ctx, - cancel_map, + cancellation_handler, socket, ClientMode::Tcp, endpoint_rate_limiter, @@ -227,7 +227,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancel_map: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, @@ -253,8 +253,8 @@ pub async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancel_map - .cancel_session(cancel_key_data) + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id) .await .map(|()| None)?) 
} @@ -315,7 +315,7 @@ pub async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - let session = cancel_map.get_session(); + let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index b26386d159..f0da4ead23 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo}; +pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index cbae72711c..3181060e2f 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -22,6 +22,44 @@ use super::{ RateLimiterConfig, }; +pub struct RedisRateLimiter { + data: Vec, + info: &'static [RateBucketInfo], +} + +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self { + data: vec![ + RateBucket { + start: Instant::now(), + count: 0, + }; + info.len() + ], + info, + } + } + + /// Check that number of connections is below `max_rps` rps. + pub fn check(&mut self) -> bool { + let now = Instant::now(); + + let should_allow_request = self + .data + .iter_mut() + .zip(self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + self.data.iter_mut().for_each(RateBucket::inc); + } + + should_allow_request + } +} + // Simple per-endpoint rate limiter. // // Check that number of connections to the endpoint is below `max_rps` rps. 
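As an illustration (not part of the patch): the `RedisRateLimiter` added above keeps one `RateBucket` per configured `RateBucketInfo` and admits a request only if every bucket still has room; only then are the counters incremented. A minimal usage sketch, assuming `RateBucketInfo::DEFAULT_SET` and `RateBucketInfo::validate` behave as they do for the endpoint limiter configured in `proxy.rs` above:

```rust
// Illustrative only -- not part of the patch. Shows how the multi-bucket check
// is meant to be driven; the bucket definitions must live for 'static, as in
// RedisPublisherClient::new (assumed helpers: DEFAULT_SET, validate).
use proxy::rate_limiter::{RateBucketInfo, RedisRateLimiter};

fn publish_allowed() -> bool {
    let mut buckets = RateBucketInfo::DEFAULT_SET.to_vec();
    RateBucketInfo::validate(&mut buckets).expect("default buckets are valid");
    let info: &'static [RateBucketInfo] = Vec::leak(buckets);

    let mut limiter = RedisRateLimiter::new(info);
    // check() returns true and bumps every bucket only if all of them
    // (e.g. per-second and per-minute) still allow the request.
    limiter.check()
}
```

This is the check that `RedisPublisherClient::try_publish`, added later in this patch, uses to decide whether to skip publishing a cancellation message.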
diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index c2a91bed97..35d6db074e 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1 +1,2 @@ pub mod notifications; +pub mod publisher; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 158884aa17..b8297a206c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,38 +1,44 @@ use std::{convert::Infallible, sync::Arc}; use futures::StreamExt; +use pq_proto::CancelKeyData; use redis::aio::PubSub; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; use crate::{ cache::project_info::ProjectInfoCache, + cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, }; -const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); -struct ConsoleRedisClient { +struct RedisConsumerClient { client: redis::Client, } -impl ConsoleRedisClient { +impl RedisConsumerClient { pub fn new(url: &str) -> anyhow::Result { let client = redis::Client::open(url)?; Ok(Self { client }) } async fn try_connect(&self) -> anyhow::Result { let mut conn = self.client.get_async_connection().await?.into_pubsub(); - tracing::info!("subscribing to a channel `{CHANNEL_NAME}`"); - conn.subscribe(CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -enum Notification { +pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", deserialize_with = "deserialize_json_string" @@ -45,16 +51,25 @@ enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, + #[serde(rename = "/cancel_session")] + Cancel(CancelSession), } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct AllowedIpsUpdate { +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedIpsUpdate { project_id: ProjectIdInt, } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct PasswordUpdate { +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, role_name: RoleNameInt, } +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct CancelSession { + pub region_id: Option, + pub cancel_key_data: CancelKeyData, + pub session_id: Uuid, +} + fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where T: for<'de2> serde::Deserialize<'de2>, @@ -64,6 +79,88 @@ where serde_json::from_str(&s).map_err(::custom) } +struct MessageHandler< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, +> { + cache: Arc, + cancellation_handler: Arc, + region_id: String, +} + +impl< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, + > MessageHandler +{ + pub fn new(cache: Arc, 
cancellation_handler: Arc, region_id: String) -> Self { + Self { + cache, + cancellation_handler, + region_id, + } + } + pub fn disable_ttl(&self) { + self.cache.disable_ttl(); + } + pub fn enable_ttl(&self) { + self.cache.enable_ttl(); + } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] + async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { + use Notification::*; + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); + match msg { + Cancel(cancel_session) => { + tracing::Span::current().record( + "session_id", + &tracing::field::display(cancel_session.session_id), + ); + if let Some(cancel_region) = cancel_session.region_id { + // If the message is not for this region, ignore it. + if cancel_region != self.region_id { + return Ok(()); + } + } + // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. + match self + .cancellation_handler + .cancel_session_no_publish(cancel_session.cancel_key_data) + .await + { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to cancel session: {e}"); + } + } + } + _ => { + invalidate_cache(self.cache.clone(), msg.clone()); + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. + let cache = self.cache.clone(); + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg); + }); + } + } + + Ok(()) + } +} + fn invalidate_cache(cache: Arc, msg: Notification) { use Notification::*; match msg { @@ -74,50 +171,33 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.project_id, password_update.role_name, ), + Cancel(_) => unreachable!("cancel message should be handled separately"), } } -#[tracing::instrument(skip(cache))] -fn handle_message(msg: redis::Msg, cache: Arc) -> anyhow::Result<()> -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - let payload: String = msg.get_payload()?; - tracing::debug!(?payload, "received a message payload"); - - let msg: Notification = match serde_json::from_str(&payload) { - Ok(msg) => msg, - Err(e) => { - tracing::error!("broken message: {e}"); - return Ok(()); - } - }; - tracing::debug!(?msg, "received a message"); - invalidate_cache(cache.clone(), msg.clone()); - // It might happen that the invalid entry is on the way to be cached. - // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. - // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. - tokio::spawn(async move { - tokio::time::sleep(INVALIDATION_LAG).await; - invalidate_cache(cache, msg.clone()); - }); - - Ok(()) -} - /// Handle console's invalidation messages. 
#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main(url: String, cache: Arc) -> anyhow::Result +pub async fn task_main( + url: String, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { cache.enable_ttl(); + let handler = MessageHandler::new( + cache, + Arc::new(CancellationHandler::new(cancel_map, None)), + region_id, + ); loop { - let redis = ConsoleRedisClient::new(&url)?; + let redis = RedisConsumerClient::new(&url)?; let conn = match redis.try_connect().await { Ok(conn) => { - cache.disable_ttl(); + handler.disable_ttl(); conn } Err(e) => { @@ -130,7 +210,7 @@ where }; let mut stream = conn.into_on_message(); while let Some(msg) = stream.next().await { - match handle_message(msg, cache.clone()) { + match handler.handle_message(msg).await { Ok(()) => {} Err(e) => { tracing::error!("failed to handle message: {e}, will try to reconnect"); @@ -138,7 +218,7 @@ where } } } - cache.enable_ttl(); + handler.enable_ttl(); } } @@ -198,6 +278,33 @@ mod tests { } ); + Ok(()) + } + #[test] + fn parse_cancel_session() -> anyhow::Result<()> { + let cancel_key_data = CancelKeyData { + backend_pid: 42, + cancel_key: 41, + }; + let uuid = uuid::Uuid::new_v4(); + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: None, + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result); + + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: Some("region".to_string()), + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result,); + Ok(()) } } diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs new file mode 100644 index 0000000000..f85593afdd --- /dev/null +++ b/proxy/src/redis/publisher.rs @@ -0,0 +1,80 @@ +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use uuid::Uuid; + +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; + +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; + +pub struct RedisPublisherClient { + client: redis::Client, + publisher: Option, + region_id: String, + limiter: RedisRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + url: &str, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + let client = redis::Client::open(url)?; + Ok(Self { + client, + publisher: None, + region_id, + limiter: RedisRateLimiter::new(info), + }) + } + pub async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + self.publisher = None; + } + } + tracing::info!("Publisher is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let conn = self + .publisher + .as_mut() + .ok_or_else(|| anyhow::anyhow!("not connected"))?; + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + conn.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.get_async_connection().await { + Ok(conn) => { + self.publisher = Some(conn); + } + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e.into()); + } + } + Ok(()) + } +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a20600b94a..ee3e91495b 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -24,7 +24,7 @@ use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use crate::{cancellation::CancelMap, config::ProxyConfig}; +use crate::{cancellation::CancellationHandler, config::ProxyConfig}; use futures::StreamExt; use hyper::{ server::{ @@ -50,6 +50,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -115,7 +116,7 @@ pub async fn task_main( let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - + let cancellation_handler = cancellation_handler.clone(); async move { let peer_addr = match client_addr { Some(addr) => addr, @@ -127,9 +128,9 @@ pub async fn task_main( let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellation_handler = cancellation_handler.clone(); async move { - let cancel_map = Arc::new(CancelMap::default()); let session_id = uuid::Uuid::new_v4(); request_handler( @@ -137,7 +138,7 @@ pub async fn task_main( config, backend, ws_connections, - cancel_map, + cancellation_handler, session_id, peer_addr.ip(), endpoint_rate_limiter, @@ -205,7 +206,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancel_map: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, @@ -232,7 +233,7 @@ async fn request_handler( config, ctx, websocket, - cancel_map, + cancellation_handler, host, endpoint_rate_limiter, ) diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 062dd440b2..24f2bb7e8c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,5 +1,5 @@ use crate::{ - cancellation::CancelMap, + cancellation::CancellationHandler, config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, @@ -133,7 +133,7 @@ pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, websocket: HyperWebsocket, - cancel_map: Arc, + cancellation_handler: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { @@ -141,7 +141,7 @@ pub async fn serve_websocket( let res = handle_client( config, 
&mut ctx, - cancel_map, + cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8e9cc43152..e808fabbe7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,7 +38,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } @@ -91,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From 855d7b478156fbaacdc3b841e98671590c16b051 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 14 Feb 2024 10:26:32 +0000 Subject: [PATCH 120/412] hold cancel session (#6750) ## Problem In a recent refactor, we accidentally dropped the cancel session early ## Summary of changes Hold the cancel session during proxy passthrough --- proxy/src/proxy.rs | 1 + proxy/src/proxy/passthrough.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ce77098a5f..8a9445303a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -331,6 +331,7 @@ pub async fn handle_client( compute: node, req: _request_gauge, conn: _client_gauge, + cancel: session, })) } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c98f68d8d1..73c170fc0b 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,4 +1,5 @@ use crate::{ + cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, metrics::NUM_BYTES_PROXIED_COUNTER, @@ -57,6 +58,7 @@ pub struct ProxyPassthrough { pub req: IntCounterPairGuard, pub conn: IntCounterPairGuard, + pub cancel: cancellation::Session, } impl ProxyPassthrough { From edc691647d636408713b32d2a4a6a06cb737e345 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 14 Feb 2024 19:43:52 +0100 Subject: [PATCH 121/412] Proxy: remove fail fast logic to connect to compute (#6759) ## Problem Flaky tests ## Summary of changes Remove failfast logic --- proxy/src/proxy/connect_compute.rs | 35 ++++++++++++++--------------- proxy/src/proxy/tests.rs | 36 ------------------------------ 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 6e57caf998..c76e2ff6d9 100644 --- a/proxy/src/proxy/connect_compute.rs +++ 
b/proxy/src/proxy/connect_compute.rs @@ -122,25 +122,24 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = - if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() { - // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error. - // Do not need to retrieve a new node_info, just return the old one. - if !err.should_retry(num_retries) { - return Err(err.into()); - } - node_info - } else { - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); - let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; - node_info.reuse_settings(old_node_info); + let node_info = if !node_info.cached() { + // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. + // Do not need to retrieve a new node_info, just return the old one. + if !err.should_retry(num_retries) { + return Err(err.into()); + } + node_info + } else { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + ctx.latency_timer.cache_miss(); + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - mechanism.update_connect_config(&mut node_info.config); - node_info - }; + mechanism.update_connect_config(&mut node_info.config); + node_info + }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index efbd661bbf..1a01f32339 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -375,8 +375,6 @@ enum ConnectAction { Connect, Retry, Fail, - RetryPg, - FailPg, } #[derive(Clone)] @@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism { retryable: false, kind: ErrorKind::Compute, }), - ConnectAction::FailPg => Err(TestConnectError { - retryable: false, - kind: ErrorKind::Postgres, - }), - ConnectAction::RetryPg => Err(TestConnectError { - retryable: true, - kind: ErrorKind::Postgres, - }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -572,32 +562,6 @@ async fn connect_to_compute_retry() { mechanism.verify(); } -#[tokio::test] -async fn connect_to_compute_retry_pg() { - let _ = env_logger::try_init(); - use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]); - let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap(); - mechanism.verify(); -} - -#[tokio::test] -async fn connect_to_compute_fail_pg() { - let _ = env_logger::try_init(); - use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]); - let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); - mechanism.verify(); -} - /// Test that we don't retry if the error is not retryable. 
#[tokio::test] async fn connect_to_compute_non_retry_1() { From 96a4e8de660be469fb00efd7d268120890ca06fd Mon Sep 17 00:00:00 2001 From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:51:19 +0200 Subject: [PATCH 122/412] Add /terminate API (#6745) (#6853) this is to speed up suspends, see https://github.com/neondatabase/cloud/issues/10284 Cherry-pick to release branch to build new compute images --- compute_tools/src/bin/compute_ctl.rs | 25 +++++------ compute_tools/src/compute.rs | 16 +++++++ compute_tools/src/http/api.rs | 55 ++++++++++++++++++++++++ compute_tools/src/http/openapi_spec.yaml | 23 ++++++++++ control_plane/src/endpoint.rs | 4 +- libs/compute_api/src/responses.rs | 4 ++ 6 files changed, 114 insertions(+), 13 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a7e10d0aee..117919786e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -45,7 +45,6 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use nix::sys::signal::{kill, Signal}; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; @@ -53,7 +52,9 @@ use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; +use compute_tools::compute::{ + forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, +}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -394,6 +395,15 @@ fn main() -> Result<()> { info!("synced safekeepers at lsn {lsn}"); } + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + compute.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } @@ -523,16 +533,7 @@ fn cli() -> clap::Command { /// wait for termination which would be easy then. 
fn handle_exit_signal(sig: i32) { info!("received {sig} termination signal"); - let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); - if ss_pid != 0 { - let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); - kill(ss_pid, Signal::SIGTERM).ok(); - } - let pg_pid = PG_PID.load(Ordering::SeqCst); - if pg_pid != 0 { - let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - kill(pg_pid, Signal::SIGTERM).ok(); - } + forward_termination_signal(); exit(1); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1c5363d048..142bb14fe5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; +use nix::sys::signal::{kill, Signal}; + use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; @@ -1322,3 +1324,17 @@ LIMIT 100", Ok(remote_ext_metrics) } } + +pub fn forward_termination_signal() { + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html + kill(pg_pid, Signal::SIGQUIT).ok(); + } +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index fa2c4cff28..f076951239 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; @@ -123,6 +124,17 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /terminate POST request"); + match handle_terminate_request(compute).await { + Ok(()) => Response::new(Body::empty()), + Err((msg, code)) => { + error!("error handling /terminate request: {msg}"); + render_json_error(&msg, code) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { .unwrap() } +async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return Ok(()); + } + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for termination request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.status = ComputeStatus::TerminationPending; + compute.state_changed.notify_all(); + drop(state); + } + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. 
+ // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. + let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Terminated, current status: {:?}", + state.status + ); + } + + Ok(()) + }) + .await + .unwrap()?; + info!("terminated Postgres"); + Ok(()) +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(port: u16, state: Arc) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cedc6ece8f..d2ec54299f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -168,6 +168,29 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /terminate: + post: + tags: + - Terminate + summary: Terminate Postgres and wait for it to exit + description: "" + operationId: terminate + responses: + 200: + description: Result + 412: + description: "wrong state" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: "Unexpected error" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index f1fe12e05f..ce8f035dfc 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -652,7 +652,9 @@ impl Endpoint { } ComputeStatus::Empty | ComputeStatus::ConfigurationPending - | ComputeStatus::Configuration => { + | ComputeStatus::Configuration + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } } diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 92bbf79cd4..fd0c90d447 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -52,6 +52,10 @@ pub enum ComputeStatus { // compute will exit soon or is waiting for // control-plane to terminate it. Failed, + // Termination requested + TerminationPending, + // Terminated Postgres + Terminated, } fn rfc3339_serialize(x: &Option>, s: S) -> Result From 936a00e077403de97795e6432788ccc0e5279563 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 26 Feb 2024 10:19:24 +0100 Subject: [PATCH 123/412] pageserver: remove two obsolete/unused per-timeline metrics (#6893) over-compensating the addition of a new per-timeline metric in https://github.com/neondatabase/neon/pull/6834 part of https://github.com/neondatabase/neon/issues/6737 --- pageserver/src/metrics.rs | 35 --------------------------------- test_runner/fixtures/metrics.py | 2 -- 2 files changed, 37 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee0bd268cc..1749e02c7f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -642,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. 
-static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -1802,8 +1782,6 @@ pub(crate) struct TimelineMetrics { /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, - pub num_persistent_files_created: IntCounter, - pub persistent_bytes_written: IntCounter, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, } @@ -1885,12 +1863,6 @@ impl TimelineMetrics { }; let directory_entries_count_gauge: Lazy UIntGauge>> = Lazy::new(Box::new(directory_entries_count_gauge_closure)); - let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); - let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -1912,8 +1884,6 @@ impl TimelineMetrics { resident_physical_size_gauge, current_logical_size_gauge, directory_entries_count_gauge, - num_persistent_files_created, - persistent_bytes_written, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, @@ -1923,8 +1893,6 @@ impl TimelineMetrics { pub(crate) fn record_new_file_metrics(&self, sz: u64) { self.resident_physical_size_add(sz); - self.num_persistent_files_created.inc_by(1); - self.persistent_bytes_written.inc_by(sz); } pub(crate) fn resident_physical_size_sub(&self, sz: u64) { @@ -1957,9 +1925,6 @@ impl Drop for TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]); } - let _ = - NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]); self.evictions_with_low_residence_duration diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd4618ca6a..c615dd154f 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -147,8 +147,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_sum", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", - "pageserver_created_persistent_files_total", - "pageserver_written_persistent_bytes_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, From 6f7f8958db48bfc08bebe1e71fa7c76a5a09b2e2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 26 Feb 2024 10:24:58 +0000 Subject: [PATCH 124/412] pageserver: only write out legacy tenant config if no generation (#6891) ## Problem Previously we always wrote out both legacy and modern tenant config files. The legacy write enabled rollbacks, but we are long past the point where that is needed. We still need the legacy format for situations where someone is running tenants without generations (that will be yanked as well eventually), but we can avoid writing it out at all if we do have a generation number set. We implicitly also avoid writing the legacy config if our mode is Secondary (secondary mode is newer than generations). ## Summary of changes - Make writing legacy tenant config conditional on there being no generation number set. --- pageserver/src/tenant.rs | 27 +++++++++++++++---------- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_tenant_conf.py | 3 +-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6389d52014..ebdff5f924 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2573,19 +2573,24 @@ impl Tenant { legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { - // Forward compat: write out an old-style configuration that old versions can read, in case we roll back - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // Once we use LocationMode, generations are mandatory. If we aren't using generations, - // then drop out after writing legacy-style config. + // The modern-style LocationConf config file requires a generation to be set. In case someone + // is running a pageserver without the infrastructure to set generations, write out the legacy-style + // config file that only contains TenantConf. 
+ // + // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388 + if attach_conf.generation.is_none() { - tracing::debug!("Running without generations, not writing new-style LocationConf"); + tracing::info!( + "Running without generations, writing legacy-style tenant config file" + ); + Self::persist_tenant_config_legacy( + tenant_shard_id, + legacy_config_path, + &location_conf.tenant_conf, + ) + .await?; + return Ok(()); } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 441b64ebfc..6cb7656660 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3812,7 +3812,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" + r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2ed22cabc4..a2ffd200a6 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -299,8 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # tenant is created with defaults, as in without config file (tenant_id, timeline_id) = env.neon_cli.create_tenant() - config_path = env.pageserver.tenant_dir(tenant_id) / "config" - assert config_path.exists(), "config file is always initially created" + config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1" http_client = env.pageserver.http_client() From 970066a914ec6f8a5c827e26e7ea7247eedfaeba Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 29 Feb 2024 13:55:38 +0000 Subject: [PATCH 125/412] libs: fix expired token in auth decode test (#6963) The test token expired earlier today (1709200879). I regenerated the token, but without an expiration date this time. --- libs/utils/src/auth.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 51ab238d77..fbf0dff665 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -206,12 +206,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH // "scope": "tenant", // "tenant_id": "3d1f7595b468230304e0b73cecbcb081", // "iss": "neon.controlplane", - // "exp": 1709200879, // "iat": 1678442479 // } // ``` // - let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; + let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); From f0a9017008a5ce26a9329042c381d722a7ae5cb7 Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 11 Mar 2024 10:10:04 +0200 Subject: [PATCH 126/412] Export db size, deadlocks and changed row metrics (#7050) ## Problem We want to report metrics for the oldest user database. 
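
For reference, the per-database values the new `pg_stats_userdb` collector reports can be previewed by hand; the snippet below simply mirrors the collector query added to `vm-image-spec.yaml` in this change, so the column aliases match the exported `kind` labels (actual numbers will vary per project — this is an illustration, not an additional change).

```sql
-- Illustration only: stats for the oldest non-system database,
-- i.e. the single database the collector is limited to.
select pg_database_size(datname) as db_size, deadlocks,
       tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
       datname
from pg_stat_database
where datname in (
    select datname
    from pg_database
    where datname <> 'postgres' and not datistemplate
    order by oid
    limit 1
);
```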
--- vm-image-spec.yaml | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c1b7ad533a..5b93088303 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -142,6 +142,51 @@ files: query: | select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + - metric_name: pg_stats_userdb + type: gauge + help: 'Stats for the oldest non-system db' + key_labels: + - datname + value_label: kind + values: + - db_size + - deadlocks + # Rows + - inserted + - updated + - deleted + # We export stats for only one non-system database. Without this limit + # it is too easy to abuse the system by creating lots of databases. + # We can try lifting this limit in the future after we understand the needs better. + query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 1 + ); + + - metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + + - metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + build: | # Build cgroup-tools # From 464717451b6d6fc25b3ae2c844510d6e0aca9458 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 18 Mar 2024 11:28:45 +0200 Subject: [PATCH 127/412] build: make procfs linux only dependency (#7156) the dependency refuses to build on macos so builds on `main` are broken right now, including the `release` PR. --- Cargo.lock | 1 - pageserver/Cargo.toml | 4 +++- pageserver/src/virtual_file/io_engine.rs | 1 + workspace_hack/Cargo.toml | 2 -- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 022dc11f07..c4f925e3c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6988,7 +6988,6 @@ dependencies = [ "axum", "base64 0.21.1", "base64ct", - "byteorder", "bytes", "cc", "chrono", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2702a2040a..f304294591 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,7 +48,6 @@ postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true -procfs.workspace = true rand.workspace = true regex.workspace = true scopeguard.workspace = true @@ -90,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] criterion.workspace = true hex-literal.workspace = true diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 2dd0ce64d6..7a27be2ca1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -9,6 +9,7 @@ //! //! 
+#[cfg(target_os = "linux")] pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::{IoBuf, Slice}; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0646091006..8593b752c2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -24,7 +24,6 @@ aws-smithy-types = { version = "1", default-features = false, features = ["byte- axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } -byteorder = { version = "1", features = ["i128"] } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } @@ -87,7 +86,6 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } -byteorder = { version = "1", features = ["i128"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } From fb2b1ce57bc394493233c7cfc597531b42980529 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Mar 2024 11:47:59 +0000 Subject: [PATCH 128/412] fixup(#7141 / tokio_epoll_uring_ext): high frequency log message The PR #7141 added log message ``` ThreadLocalState is being dropped and id might be re-used in the future ``` which was supposed to be emitted when the thread-local is destroyed. Instead, it was emitted on _each_ call to `thread_local_system()`, ie.., on each tokio-epoll-uring operation. --- .../io_engine/tokio_epoll_uring_ext.rs | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c4b10f3a24..9b2efef5d4 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -6,7 +6,7 @@ //! See for more details. 
use std::sync::atomic::AtomicU32; -use std::sync::Arc; +use std::sync::{Arc, Weak}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -24,23 +24,36 @@ struct ThreadLocalState(Arc); struct ThreadLocalStateInner { cell: tokio::sync::OnceCell, launch_attempts: AtomicU32, + weak_self: Weak, } impl ThreadLocalState { pub fn new() -> Self { - Self(Arc::new(ThreadLocalStateInner { + Self(Arc::new_cyclic(|weak| ThreadLocalStateInner { cell: tokio::sync::OnceCell::default(), launch_attempts: AtomicU32::new(0), + weak_self: Weak::clone(weak), })) } - pub fn make_id_string(&self) -> String { - format!("0x{:p}", Arc::as_ptr(&self.0)) - } } -impl Drop for ThreadLocalState { +impl ThreadLocalStateInner { + pub fn make_id_string(&self) -> String { + format!("0x{:p}", self.weak_self.as_ptr()) + } +} + +impl Drop for ThreadLocalStateInner { fn drop(&mut self) { - info!(parent: None, id=%self.make_id_string(), "tokio-epoll-uring_ext: ThreadLocalState is being dropped and id might be re-used in the future"); + info!(parent: None, id=%self.make_id_string(), "tokio_epoll_uring_ext: thread-local state is being dropped and id might be re-used in the future"); + } +} + +impl std::ops::Deref for ThreadLocalState { + type Target = ThreadLocalStateInner; + + fn deref(&self) -> &Self::Target { + &self.0 } } From 41fc96e20fd8546240a26540ba4e51df5de4e30a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Mar 2024 16:12:01 +0100 Subject: [PATCH 129/412] fixup(#7160 / tokio_epoll_uring_ext): double-panic caused by info! in thread-local's drop() (#7164) Manual testing of the changes in #7160 revealed that, if the thread-local destructor ever runs (it apparently doesn't in our test suite runs, otherwise #7160 would not have auto-merged), we can encounter an `abort()` due to a double-panic in the tracing code. This github comment here contains the stack trace: https://github.com/neondatabase/neon/pull/7160#issuecomment-2003778176 This PR reverts #7160 and uses a atomic counter to identify the thread-local in log messages, instead of the memory address of the thread local, which may be re-used. --- .../io_engine/tokio_epoll_uring_ext.rs | 29 +++++-------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index 9b2efef5d4..6ea19d6b2d 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,8 +5,8 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. 
-use std::sync::atomic::AtomicU32; -use std::sync::{Arc, Weak}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -24,38 +24,25 @@ struct ThreadLocalState(Arc); struct ThreadLocalStateInner { cell: tokio::sync::OnceCell, launch_attempts: AtomicU32, - weak_self: Weak, + /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] + thread_local_state_id: u64, } impl ThreadLocalState { pub fn new() -> Self { - Self(Arc::new_cyclic(|weak| ThreadLocalStateInner { + Self(Arc::new(ThreadLocalStateInner { cell: tokio::sync::OnceCell::default(), launch_attempts: AtomicU32::new(0), - weak_self: Weak::clone(weak), + thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed), })) } -} -impl ThreadLocalStateInner { pub fn make_id_string(&self) -> String { - format!("0x{:p}", self.weak_self.as_ptr()) + format!("{}", self.0.thread_local_state_id) } } -impl Drop for ThreadLocalStateInner { - fn drop(&mut self) { - info!(parent: None, id=%self.make_id_string(), "tokio_epoll_uring_ext: thread-local state is being dropped and id might be re-used in the future"); - } -} - -impl std::ops::Deref for ThreadLocalState { - type Target = ThreadLocalStateInner; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} +static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0); thread_local! { static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); From 0519138b04ae4b711b4a5b3328cc7e9495ca9962 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 10 Apr 2024 06:13:48 -0700 Subject: [PATCH 130/412] compute_ctl: Auto-set dynamic_shared_memory_type (#7348) Part of neondatabase/cloud#12047. The basic idea is that for our VMs, we want to enable swap and disable Linux memory overcommit. Alongside these, we should set postgres' dynamic_shared_memory_type to mmap, but we want to avoid setting it to mmap if swap is not enabled. Implementing this in the control plane would be fiddly, but it's relatively straightforward to add to compute_ctl. --- compute_tools/src/config.rs | 25 +++++++++++++++++++++++-- compute_tools/src/pg_helpers.rs | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). 
+ .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } From 459bc479dcdcf86cbf80b2b8a7c0bd4d1fa6ff35 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Apr 2024 12:47:24 +0100 Subject: [PATCH 131/412] pageserver: fix unlogged relations with sharding (#7454) ## Problem - #7451 INIT_FORKNUM blocks must be stored on shard 0 to enable including them in basebackup. This issue can be missed in simple tests because creating an unlogged table isn't sufficient -- to repro I had to create an _index_ on an unlogged table (then restart the endpoint). Closes: #7451 ## Summary of changes - Add a reproducer for the issue. - Tweak the condition for `key_is_shard0` to include anything that isn't a normal relation block _and_ any normal relation block whose forknum is INIT_FORKNUM. - To enable existing databases to recover from the issue, add a special case that omits relations if they were stored on the wrong INITFORK. This enables postgres to start and the user to drop the table and recreate it. --- libs/pageserver_api/src/shard.rs | 27 +++++++++++++++++- pageserver/src/basebackup.rs | 17 +++++++++-- test_runner/regress/test_sharding.py | 42 ++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index c293ad705b..6a8a5cc8f3 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,6 +5,7 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; @@ -537,6 +538,24 @@ impl ShardIdentity { } } + /// Special case for issue `` + /// + /// When we fail to read a forknum block, this function tells us whether we may ignore the error + /// as a symptom of that issue. + pub fn is_key_buggy_forknum(&self, key: &Key) -> bool { + if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM { + return false; + } + + let mut hash = murmurhash32(key.field4); + hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0)); + let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8); + + // The key may be affected by issue #7454: it is an initfork and it would not + // have mapped to shard 0 until we fixed that issue. + mapped_shard != ShardNumber(0) + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// @@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. 
This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. + let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..107758f385 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -297,7 +297,20 @@ where if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. - self.add_rel(rel, rel).await?; + if let Err(_e) = self.add_rel(rel, rel).await { + if self + .timeline + .get_shard_identity() + .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0)) + { + // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation + // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows + // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and + // recreate. 
+ tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation"); + continue; + } + }; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfaab9125f..101d2620b0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) From 101043122e845869ae93da81d4dc1218f72c3b1d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 13 May 2024 13:41:14 +0100 Subject: [PATCH 132/412] Revert protocol version upgrade (#7727) ## Problem "John pointed out that the switch to protocol version 2 made test_gc_aggressive test flaky: https://github.com/neondatabase/neon/issues/7692. I tracked it down, and that is indeed an issue. Conditions for hitting the issue: The problem occurs in the primary GC horizon is set to a very low value, e.g. 0. If the primary is actively writing WAL, and GC runs in the pageserver at the same time that the primary sends a GetPage request, it's possible that the GC advances the GC horizon past the GetPage request's LSN. I'm working on a fix here: https://github.com/neondatabase/neon/pull/7708." - Heikki ## Summary of changes Use protocol version 1 as default. 
--- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5ce2caff3..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,7 +49,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 2; +int neon_protocol_version = 1; static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; @@ -860,7 +860,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ + 1, /* default to old protocol for now */ 1, /* min */ 2, /* max */ PGC_SU_BACKEND, From dbff725a0c80a78b7be7aead8bc95d7e87e5ef0f Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 23 May 2024 13:35:59 -0700 Subject: [PATCH 133/412] Remove apostrophe (#7868) ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 0f9d56e466..484a86fc21 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -320,7 +320,7 @@ files: - metric_name: wal_is_lost type: gauge - help: 'Whether or not the replication slot\'s wal_status is lost' + help: 'Whether or not the replication slot wal_status is lost' key_labels: - slot_name values: [wal_status_is_lost] From 068c158ca564b42c15a9b9a0b2cdfe4423809e1e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 27 May 2024 15:57:57 +0300 Subject: [PATCH 134/412] Fix connect to PS on MacOS/X (#7885) ## Problem After [0e4f1826805d040a23c25c54f3993a942755dbc2] which introduce async connect Neon is not able to connect to page server. ## Summary of changes Perform sync commit at MacOS/X ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 89 +++++++++++++++++++------------------- pgxn/neon/pagestore_smgr.c | 10 ++--- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a9c8d59c3a..5eae2d8204 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -125,13 +125,6 @@ typedef struct * - WL_EXIT_ON_PM_DEATH. */ WaitEventSet *wes_read; - /*--- - * WaitEventSet containing: - * - WL_SOCKET_WRITABLE on 'conn' - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. 
- */ - WaitEventSet *wes_write; } PageServer; static PageServer page_servers[MAX_SHARDS]; @@ -336,11 +329,6 @@ CLEANUP_AND_DISCONNECT(PageServer *shard) FreeWaitEventSet(shard->wes_read); shard->wes_read = NULL; } - if (shard->wes_write) - { - FreeWaitEventSet(shard->wes_write); - shard->wes_write = NULL; - } if (shard->conn) { PQfinish(shard->conn); @@ -436,22 +424,6 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); - - shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, - PQsocket(shard->conn), - NULL, NULL); - shard->state = PS_Connecting_Startup; /* fallthrough */ } @@ -460,13 +432,12 @@ pageserver_connect(shardno_t shard_no, int elevel) char *pagestream_query; int ps_send_query_ret; bool connected = false; - + int poll_result = PGRES_POLLING_WRITING; neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); do { WaitEvent event; - int poll_result = PQconnectPoll(shard->conn); switch (poll_result) { @@ -497,25 +468,45 @@ pageserver_connect(shardno_t shard_no, int elevel) } case PGRES_POLLING_READING: /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - /* query cancellation, backend shutdown */ - CHECK_FOR_INTERRUPTS(); - + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_READABLE) + break; + } /* PQconnectPoll() handles the socket polling state updates */ break; case PGRES_POLLING_WRITING: /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_write, -1L, &event, 1, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - /* query cancellation, backend shutdown */ - CHECK_FOR_INTERRUPTS(); - + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_WRITEABLE) + break; + } /* PQconnectPoll() handles the socket polling state updates */ break; @@ -524,12 +515,22 @@ pageserver_connect(shardno_t shard_no, int elevel) connected = true; break; } + poll_result = PQconnectPoll(shard->conn); + elog(DEBUG5, "PQconnectPoll=>%d", poll_result); } while (!connected); /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, 
WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + switch (neon_protocol_version) { case 2: diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ac505fe6fb..0e4d210be8 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -584,9 +584,9 @@ prefetch_read(PrefetchRequest *slot) slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu", - slot->status, (size_t) (void *) slot->response, - slot->my_ring_index, MyPState->ring_receive); + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); @@ -606,8 +606,8 @@ prefetch_read(PrefetchRequest *slot) else { neon_shard_log(slot->shard_no, WARNING, - "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - slot->my_ring_index, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + (long)slot->my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), slot->buftag.forkNum, slot->buftag.blockNum); return false; From be014a2222b2883c7a9cf3fdbc479c2b134d85e4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 May 2024 22:18:09 +0300 Subject: [PATCH 135/412] Do not produce error if gin page is not restored in redo (#7876) ## Problem See https://github.com/neondatabase/cloud/issues/10845 ## Summary of changes Do not report error if GIN page is not restored ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_gin_redo.py | 22 ++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 5 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_gin_redo.py diff --git a/test_runner/regress/test_gin_redo.py b/test_runner/regress/test_gin_redo.py new file mode 100644 index 0000000000..9205882239 --- /dev/null +++ b/test_runner/regress/test_gin_redo.py @@ -0,0 +1,22 @@ +import time + +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +# +# Test that redo of XLOG_GIN_VACUUM_PAGE doesn't produce error +# +def test_gin_redo(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + time.sleep(1) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + con = primary.connect() + cur = con.cursor() + cur.execute("create table gin_test_tbl(id integer, i int4[])") + cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)") + cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g") + cur.execute("delete from gin_test_tbl where id % 2 = 0") + cur.execute("vacuum gin_test_tbl") + wait_replica_caughtup(primary, secondary) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0d30e28f74..17e0f5ff4e 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f +Subproject commit 17e0f5ff4e1905691aa40e1e08f9b79b14c99652 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 74fb144890..c2c3d40534 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d +Subproject commit c2c3d40534db97d83dd7e185d1971e707fa2f445 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3c2b9d576c..b228f20372 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2 +Subproject commit b228f20372ebcabfd7946647cb7adbd38bacb14a diff --git a/vendor/revisions.json b/vendor/revisions.json index 2f16f334c5..5bf4e289ef 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"], - "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"], - "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"] + "v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"], + "v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"], + "v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"] } From 0384267d58b1a4ebcc120136788df90161a040e0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 10 Jun 2024 13:20:20 +0200 Subject: [PATCH 136/412] Revert "Include openssl and ICU statically linked" (#8003) Reverts neondatabase/neon#7956 Rationale: compute incompatibilties Slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1718011276665839?thread_ts=1718008160.431869&cid=C033RQ5SPDH Relevant quotes from @hlinnaka > If we go through with the current release candidate, but the compute is pinned, people who create new projects will get that warning, which is silly. 
To them, it looks like the ICU version was downgraded, because initdb was run with newer version. > We should upgrade the ICU version eventually. And when we do that, users with old projects that use ICU will start to see that warning. I think that's acceptable, as long as we do homework, notify users, and communicate that properly. > When do that, we should to try to upgrade the storage and compute versions at roughly the same time. --- .github/workflows/build_and_test.yml | 6 +++--- Dockerfile | 2 ++ Dockerfile.build-tools | 32 ---------------------------- Makefile | 15 +------------ 4 files changed, 6 insertions(+), 49 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1fc0fbb0b6..b9caf76060 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -299,21 +299,21 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/Dockerfile b/Dockerfile index b4900d4a94..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,6 +69,8 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ + libicu67 \ + openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 91194eda1a..460b8c996d 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -112,35 +112,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=3.2.2 -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make ${MAKE_ARGS} && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - -# Set the ICU version -ENV ICU_VERSION=72.1 -ENV ICU_PREFIX=/usr/local/icu - -# Download and build static ICU -RUN wget 
https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ - tar -xzf icu4c-${ICU_VERSION//./_}-src.tgz && \ - cd icu/source && \ - ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ - make && \ - make install && \ - cd ../.. && \ - rm -rf icu icu4c-${ICU_VERSION//./_}-src.tgz - # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot @@ -199,6 +170,3 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version - -# Set following flag to check in Makefile if its running in Docker -RUN touch /home/nonroot/.docker_build diff --git a/Makefile b/Makefile index b5f426344e..dcbfdbcbc1 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl -ICU_PREFIX_DIR := /usr/local/icu - # # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. @@ -23,16 +20,6 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif -ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) - # Exclude static build openssl, icu for local build (MacOS, Linux) - # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include - PG_CONFIGURE_OPTS += --with-icu - PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' - PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' -endif - UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux @@ -41,7 +28,7 @@ else ifeq ($(UNAME_S),Darwin) ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX := $(shell brew --prefix openssl@3) + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure From be598f1bf41f93cec0f3462687f4c75a63b44c16 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 11:23:31 +0300 Subject: [PATCH 137/412] tests: remove a leftover 'running' flag (#8216) The 'running' boolean was replaced with a semaphore in commit f0e2bb79b2, but this initialization was missed. Remove it so that if a test tries to access it, you get an error rather than always claiming that the endpoint is not running. 
Spotted by Arseny at https://github.com/neondatabase/neon/pull/7288#discussion_r1660068657 --- test_runner/fixtures/neon_fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4911917bf4..a1cb1b5195 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3491,7 +3491,6 @@ class Endpoint(PgProtocol, LogUtils): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA From 7ee2bebdb78d199c69745ee2d65285acfbe2eba9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 12:58:08 +0300 Subject: [PATCH 138/412] tests: Make neon_xlogflush() flush all WAL, if you omit the LSN arg This makes it much more convenient to use in the common case that you want to flush all the WAL. (Passing pg_current_wal_insert_lsn() as the argument doesn't work for the same reasons as explained in the comments: we need to be back off to the beginning of a page if the previous record ended at page boundary.) I plan to use this to fix the issue that Arseny Sher called out at https://github.com/neondatabase/neon/pull/7288#discussion_r1660063852 --- pgxn/neon_test_utils/Makefile | 2 +- ...tils--1.1.sql => neon_test_utils--1.2.sql} | 2 +- pgxn/neon_test_utils/neon_test_utils.control | 2 +- pgxn/neon_test_utils/neontest.c | 38 ++++++++++++++++++- 4 files changed, 40 insertions(+), 4 deletions(-) rename pgxn/neon_test_utils/{neon_test_utils--1.1.sql => neon_test_utils--1.2.sql} (96%) diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 1ee87357e5..1371272439 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.1.sql +DATA = neon_test_utils--1.2.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.1.sql b/pgxn/neon_test_utils/neon_test_utils--1.2.sql similarity index 96% rename from pgxn/neon_test_utils/neon_test_utils--1.1.sql rename to pgxn/neon_test_utils/neon_test_utils--1.2.sql index 534784f319..f84a24ec8d 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.1.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.2.sql @@ -41,7 +41,7 @@ RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d640835..c7b9191ddc 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 47f245fbf1..944936d395 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -15,6 +15,7 @@ #include "access/relation.h" 
#include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" @@ -444,11 +445,46 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) /* * Directly calls XLogFlush(lsn) to flush WAL buffers. + * + * If 'lsn' is not specified (is NULL), flush all generated WAL. */ Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("cannot flush WAL during recovery."))); + + if (!PG_ARGISNULL(0)) + lsn = PG_GETARG_LSN(0); + else + { + lsn = GetXLogInsertRecPtr(); + + /*--- + * The LSN returned by GetXLogInsertRecPtr() is the position where the + * next inserted record would begin. If the last record ended just at + * the page boundary, the next record will begin after the page header + * on the next page, and that's what GetXLogInsertRecPtr().returns, + * but the page header has not been written yet. If we tried to flush + * it, XLogFlush() would throw an error: + * + * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X + * + * To avoid that, if the insert position points to just after the page + * header, back off to page boundary. + */ + if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ) + lsn -= SizeOfXLogShortPHD; + else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ) + lsn -= SizeOfXLogLongPHD; + } XLogFlush(lsn); PG_RETURN_VOID(); From 57f476ff5a2d5eec4d3585f710b78636ce75f794 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 12:58:12 +0300 Subject: [PATCH 139/412] Restore running xacts from CLOG on replica startup (#7288) We have one pretty serious MVCC visibility bug with hot standby replicas. We incorrectly treat any transactions that are in progress in the primary, when the standby is started, as aborted. That can break MVCC for queries running concurrently in the standby. It can also lead to hint bits being set incorrectly, and that damage can last until the replica is restarted. The fundamental bug was that we treated any replica start as starting from a shut down server. The fix for that is straightforward: we need to set 'wasShutdown = false' in InitWalRecovery() (see changes in the postgres repo). However, that introduces a new problem: with wasShutdown = false, the standby will not open up for queries until it receives a running-xacts WAL record from the primary. That's correct, and that's how Postgres hot standby always works. But it's a problem for Neon, because: * It changes the historical behavior for existing users. Currently, the standby immediately opens up for queries, so if they now need to wait, we can breka existing use cases that were working fine (assuming you don't hit the MVCC issues). * The problem is much worse for Neon than it is for standalone PostgreSQL, because in Neon, we can start a replica from an arbitrary LSN. In standalone PostgreSQL, the replica always starts WAL replay from a checkpoint record, and the primary arranges things so that there is always a running-xacts record soon after each checkpoint record. You can still hit this issue with PostgreSQL if you have a transaction with lots of subtransactions running in the primary, but it's pretty rare in practice. 
To mitigate that, we introduce another way to collect the running-xacts information at startup, without waiting for the running-xacts WAL record: We can scan the CLOG for XIDs that haven't been marked as committed or aborted. It has limitations with subtransactions too, but should mitigate the problem for most users. See https://github.com/neondatabase/neon/issues/7236.

Co-authored-by: Konstantin Knizhnik
---
 pageserver/src/walingest.rs                   |  40 +-
 pgxn/neon/neon.c                              | 293 ++++
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 test_runner/fixtures/pageserver/utils.py      |   2 +-
 test_runner/regress/test_replica_start.py     | 646 ++++++++++++++++++
 test_runner/regress/test_replication_start.py |  32 -
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 vendor/postgres-v16                           |   2 +-
 vendor/revisions.json                         |   6 +-
 10 files changed, 981 insertions(+), 48 deletions(-)
 create mode 100644 test_runner/regress/test_replica_start.py
 delete mode 100644 test_runner/regress/test_replication_start.py

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index fb10bca5a6..07c90385e6 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -343,7 +343,33 @@ impl WalIngest {
             xlog_checkpoint.oldestActiveXid, self.checkpoint.oldestActiveXid
         );
-        self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+
+        // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+        // because at shutdown, all in-progress transactions will implicitly
+        // end. Postgres startup code knows that, and allows hot standby to start
+        // immediately from a shutdown checkpoint.
+        //
+        // In Neon, Postgres hot standby startup always behaves as if starting from
+        // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+        // instead of overwriting self.checkpoint.oldestActiveXid with
+        // InvalidTransactionid from the checkpoint WAL record, update it to a
+        // proper value, knowing that there are no in-progress transactions at this
+        // point, except for prepared transactions.
+        //
+        // See also the neon code changes in the InitWalRecovery() function.
+        if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+            && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+        {
+            let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
+            for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                    oldest_active_xid = xid;
+                }
+            }
+            self.checkpoint.oldestActiveXid = oldest_active_xid;
+        } else {
+            self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+        }

         // Write a new checkpoint key-value pair on every checkpoint record, even
         // if nothing really changed. Not strictly required, but it seems nice to
@@ -375,6 +401,7 @@ impl WalIngest {
             if info == pg_constants::XLOG_RUNNING_XACTS {
                 let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
                 self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+                self.checkpoint_modified = true;
             }
         }
         pg_constants::RM_REPLORIGIN_ID => {
@@ -1277,13 +1304,10 @@ impl WalIngest {
             xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
         );
-        // Here we treat oldestXid and oldestXidDB
-        // differently from postgres redo routines.
-        // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid
-        // until checkpoint happens and updates the value.
-        // Here we can use the most recent value.
-        // It's just an optimization, though and can be deleted.
-        // TODO Figure out if there will be any issues with replica.
+ // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is + // truncated, but a checkpoint record with the updated values isn't written until + // later. In Neon, a server can start at any LSN, not just on a checkpoint record, + // so we keep the oldestXid and oldestXidDB up-to-date. self.checkpoint.oldestXid = xlrec.oldest_xid; self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; self.checkpoint_modified = true; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b6b2db7e71..e4968bdf89 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,8 @@ #include "fmgr.h" #include "miscadmin.h" +#include "access/subtrans.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" @@ -22,10 +24,12 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" +#include "storage/proc.h" #include "storage/procsignal.h" #include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" +#include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" #include "utils/wait_event.h" @@ -266,6 +270,293 @@ LogicalSlotsMonitorMain(Datum main_arg) } } +/* + * XXX: These private to procarray.c, but we need them here. + */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + +/* + * Restore running-xact information by scanning the CLOG at startup. + * + * In PostgreSQL, a standby always has to wait for a running-xacts WAL record + * to arrive before it can start accepting queries. Furthermore, if there are + * transactions with too many subxids (> 64) open to fit in the in-memory + * subxids cache, the running-xacts record will be marked as "suboverflowed", + * and the standby will need to also wait for the currently in-progress + * transactions to finish. + * + * That's not great in PostgreSQL, because a hot standby does not necessary + * open up for queries immediately as you might expect. But it's worse in + * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint + * record; it can start at any LSN. Postgres arranges things so that there is + * a running-xacts record soon after every checkpoint record, but when you + * start from an arbitrary LSN, that doesn't help. If the primary is idle, or + * not running at all, it might never write a new running-xacts record, + * leaving the replica in a limbo where it can never start accepting queries. + * + * To mitigate that, we have an additional mechanism to find the running-xacts + * information: we scan the CLOG, making note of any XIDs not marked as + * committed or aborted. They are added to the Postgres known-assigned XIDs + * array by calling ProcArrayApplyRecoveryInfo() in the caller of this + * function. + * + * There is one big limitation with that mechanism: The size of the + * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs, + * we have to give up. Furthermore, we don't know how many of the in-progress + * XIDs are subtransactions, and if we use up all the space in the + * known-assigned XIDs array for subtransactions, we might run out of space in + * the array later during WAL replay, causing the replica to shut down with + * "ERROR: too many KnownAssignedXids". 
The safe # of XIDs that we can add to + * the known-assigned array without risking that error later is very low, + * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up + * to half of the known-assigned XIDs array for the subtransactions, even + * though that risks getting the error later. + * + * Note: It's OK if the recovered list of XIDs includes some transactions that + * have crashed in the primary, and hence will never commit. They will be seen + * as in-progress, until we see a new next running-acts record with an + * oldestActiveXid that invalidates them. That's how the known-assigned XIDs + * array always works. + * + * If scraping the CLOG doesn't succeed for some reason, like the subxid + * overflow, Postgres will fall back to waiting for a running-xacts record + * like usual. + * + * Returns true if a complete list of in-progress XIDs was scraped. + */ +static bool +RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids) +{ + TransactionId from; + TransactionId till; + int max_xcnt; + TransactionId *prepared_xids = NULL; + int n_prepared_xids; + TransactionId *restored_xids = NULL; + int n_restored_xids; + int next_prepared_idx; + + Assert(*xids == NULL); + + /* + * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We + * don't know where to start the scan. + * + * This shouldn't happen, because the pageserver always maintains a valid + * oldestActiveXid nowadays. Except when starting at an old point in time + * that was ingested before the pageserver was taught to do that. + */ + if (!TransactionIdIsValid(checkpoint->oldestActiveXid)) + { + elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set"); + goto fail; + } + + /* + * We will scan the CLOG starting from the oldest active XID. + * + * In some corner cases, the oldestActiveXid from the last checkpoint + * might already have been truncated from the CLOG. That is, + * oldestActiveXid might be older than oldestXid. That's possible because + * oldestActiveXid is only updated at checkpoints. After the last + * checkpoint, the oldest transaction might have committed, and the CLOG + * might also have been already truncated. So if oldestActiveXid is older + * than oldestXid, start at oldestXid instead. (Otherwise we'd try to + * access CLOG segments that have already been truncated away.) + */ + from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid) + ? checkpoint->oldestActiveXid : checkpoint->oldestXid; + till = XidFromFullTransactionId(checkpoint->nextXid); + + /* + * To avoid "too many KnownAssignedXids" error later during replay, we + * limit number of collected transactions. This is a tradeoff: if we are + * willing to consume more of the KnownAssignedXids space for the XIDs + * now, that allows us to start up, but we might run out of space later. + * + * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS, + * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In + * PostgreSQL, that's always enough because the primary will always write + * an XLOG_XACT_ASSIGNMENT record if a transaction has more than + * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows + * the standby to mark the XIDs in pg_subtrans and removing them from the + * KnowingAssignedXids array. + * + * Here, we don't know which XIDs belong to subtransactions that have + * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. 
If we + * wanted to be totally safe and avoid the possibility of getting a "too + * many KnownAssignedXids" error later, we would have to limit ourselves + * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top + * transaction IDs too, because we cannot distinguish between top + * transaction IDs and subtransactions here. + * + * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That + * strikes a sensible balance between being useful, and risking a "too + * many KnownAssignedXids" error later. + */ + max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2; + + /* + * Collect XIDs of prepared transactions in an array. This includes only + * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions + * has already been called, so we can find all the sub-transactions in + * pg_subtrans. + */ + PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids); + qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'. + */ + elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till); + restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); + n_restored_xids = 0; + next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) + { + XLogRecPtr xidlsn; + XidStatus xidstatus; + + xidstatus = TransactionIdGetStatus(xid, &xidlsn); + + /* + * "Merge" the prepared transactions into the restored_xids array as + * we go. The prepared transactions array is sorted. This is mostly + * a sanity check to ensure that all the prpeared transactions are + * seen as in-progress. (There is a check after the loop that we didn't + * miss any.) + */ + if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx]) + { + /* + * This is a top-level transaction ID of a prepared transaction. + * Include it in the array. + */ + + /* sanity check */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS) + { + elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids); + next_prepared_idx++; + } + else if (xidstatus == TRANSACTION_STATUS_COMMITTED) + { + elog(DEBUG1, "XID %u: was committed", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_ABORTED) + { + elog(DEBUG1, "XID %u: was aborted", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS) + { + /* + * In-progress transactions are included in the array. + * + * Except subtransactions of the prepared transactions. They are + * already set in pg_subtrans, and hence don't need to be tracked + * in the known-assigned XIDs array. + */ + if (n_prepared_xids > 0) + { + TransactionId parent = SubTransGetParent(xid); + + if (TransactionIdIsValid(parent)) + { + /* + * This is a subtransaction belonging to a prepared + * transaction. + * + * Sanity check that it is in the prepared XIDs array. It + * should be, because StandbyRecoverPreparedTransactions + * populated pg_subtrans, and no other XID should be set + * in it yet. (This also relies on the fact that + * StandbyRecoverPreparedTransactions sets the parent of + * each subxid to point directly to the top-level XID, + * rather than restoring the original subtransaction + * hierarchy.) 
+ */ + if (bsearch(&parent, prepared_xids, next_prepared_idx, + sizeof(TransactionId), xidLogicalComparator) == NULL) + { + elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG", + xid, parent); + Assert(false); + goto fail; + } + elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent); + goto skip; + } + } + + /* include it in the array */ + elog(DEBUG1, "XID %u: is in progress", xid); + } + else + { + /* + * SUB_COMMITTED is a transient state used at commit. We don't + * expect to see that here. + */ + elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + if (n_restored_xids >= max_xcnt) + { + /* + * Overflowed. We won't be able to install the RunningTransactions + * snapshot. + */ + elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + checkpoint->oldestXid, checkpoint->oldestActiveXid, + XidFromFullTransactionId(checkpoint->nextXid)); + goto fail; + } + + restored_xids[n_restored_xids++] = xid; + + skip: + TransactionIdAdvance(xid); + continue; + } + + /* sanity check */ + if (next_prepared_idx != n_prepared_xids) + { + elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG", + prepared_xids[next_prepared_idx]); + Assert(false); + goto fail; + } + + elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); + *nxids = n_restored_xids; + *xids = restored_xids; + return true; + + fail: + *nxids = 0; + *xids = NULL; + if (restored_xids) + pfree(restored_xids); + if (prepared_xids) + pfree(prepared_xids); + return false; +} + void _PG_init(void) { @@ -288,6 +579,8 @@ _PG_init(void) pg_init_extension_server(); + restore_running_xacts_callback = RestoreRunningXactsFromClog; + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a1cb1b5195..e1c8514351 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3856,7 +3856,9 @@ class EndpointFactory: return self - def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + def new_replica( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 60535b7592..b75a480a63 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -198,7 +198,7 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(100): + for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py new file mode 100644 index 0000000000..17d476a8a6 --- /dev/null +++ b/test_runner/regress/test_replica_start.py @@ -0,0 +1,646 @@ +""" +In PostgreSQL, a standby always has to 
wait for a running-xacts WAL record to +arrive before it can start accepting queries. Furthermore, if there are +transactions with too many subxids (> 64) open to fit in the in-memory subxids +cache, the running-xacts record will be marked as "suboverflowed", and the +standby will need to also wait for the currently in-progress transactions to +finish. + +In Neon, we have an additional mechanism that scans the CLOG at server startup +to determine the list of running transactions, so that the standby can start up +immediately without waiting for the running-xacts record, but that mechanism +only works if the # of active (sub-)transactions is reasonably small. Otherwise +it falls back to waiting. Furthermore, it's somewhat optimistic in using up the +known-assigned XIDs array: if too many transactions with subxids are started in +the primary later, the replay in the replica will crash with "too many +KnownAssignedXids" error. + +This module contains tests for those various cases at standby startup: starting +from shutdown checkpoint, using the CLOG scanning mechanism, waiting for +running-xacts record and for in-progress transactions to finish etc. +""" + +import threading +from contextlib import closing + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from fixtures.utils import query_scalar, wait_until + +CREATE_SUBXACTS_FUNC = """ +create or replace function create_subxacts(n integer) returns void as $$ +declare + i integer; +begin + for i in 1..n loop + begin + insert into t (payload) values (0); + exception + when others then + raise exception 'caught something: %', sqlerrm; + end; + end loop; +end; $$ language plpgsql +""" + + +def test_replica_start_scan_clog(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup. There is one + transaction active in the primary when the standby is started. The primary + is killed before it has a chance to write a running-xacts record. The + CLOG-scanning at neon startup allows the standby to start up anyway. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed, but then immediately kill the primary, + # before it has a chance to generate a running-xacts record. + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + +def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup, after + leaving behind crashed transactions. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Consume a lot of XIDs, then kill Postgres without giving it a + # chance to write abort records for them. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + primary.stop(mode="immediate") + + # Restart the primary. Do some light work, and shut it down cleanly + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("insert into t (payload) values (0)") + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. (Restarting the primary writes a checkpoint and/or running-xacts + # record, which allows the standby to know that the crashed XIDs are aborted) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + +def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): + """ + Test that starting a replica works right after the primary has + created a running-xacts record. This may seem like a trivial case, + but during development, we had a bug that was triggered by having + oldestActiveXid == nextXid. Starting right after a running-xacts + record is one way to test that case. + + See the module docstring for background. + """ + env = neon_simple_env + + if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("select pg_log_standby_snapshot()") + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select 123") + assert secondary_cur.fetchone() == (123,) + + +def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): + """ + Test replica startup when there are a lot of (sub)transactions active in the + primary. 
That's too many for the CLOG-scanning mechanism to handle, so the + replica has to wait for the large transaction to finish before it starts to + accept queries. + + After replica startup, test MVCC with transactions that were in-progress + when the replica was started. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create + # lots of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Start a transaction with 100000 subtransactions, and leave it open. That's + # too many to fit in the "known-assigned XIDs array" in the replica, and + # also too many to fit in the subxid caches so the running-xacts record will + # also overflow. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + + # Start another, smaller transaction in the primary. We'll come back to this + # later. + primary_conn2 = primary.connect() + primary_cur2 = primary_conn2.cursor() + primary_cur2.execute("begin") + primary_cur2.execute("insert into t (payload) values (0)") + + # Create a replica. but before that, wait for the wal to be flushed to + # safekeepers, so that the replica is started at a point where the large + # transaction is already active. (The whole transaction might not be flushed + # yet, but that's OK.) + # + # Start it in a separate thread, so that we can do other stuff while it's + # blocked waiting for the startup to finish. + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + start_secondary_thread = threading.Thread(target=secondary.start) + start_secondary_thread.start() + + # Verify that the replica has otherwise started up, but cannot start + # accepting queries yet. + log.info("Waiting 5 s to verify that the secondary does not start") + start_secondary_thread.join(5) + assert secondary.log_contains("consistent recovery state reached") + assert secondary.log_contains("started streaming WAL from primary") + # The "redo starts" message is printed when the first WAL record is + # received. It might or might not be present in the log depending on how + # far exactly the WAL was flushed when the replica was started, and whether + # background activity caused any more WAL records to be flushed on the + # primary afterwards. + # + # assert secondary.log_contains("redo # starts") + + # should not be open for connections yet + assert start_secondary_thread.is_alive() + assert not secondary.is_running() + assert not secondary.log_contains("database system is ready to accept read-only connections") + + # Commit the large transaction in the primary. + # + # Within the next 15 s, the primary should write a new running-xacts record + # to the WAL which shows the transaction as completed. Once the replica + # replays that record, it will start accepting queries. + primary_cur.execute("commit") + start_secondary_thread.join() + + # Verify that the large transaction is correctly visible in the secondary + # (but not the second, small transaction, which is still in-progress!) 
+ secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Perform some more MVCC testing using the second transaction that was + # started in the primary before the replica was created + primary_cur2.execute("select create_subxacts(10000)") + + # The second transaction still hasn't committed + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("BEGIN ISOLATION LEVEL REPEATABLE READ") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the second transaction in the primary + primary_cur2.execute("commit") + + # Should still be invisible to the old snapshot + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the REPEATABLE READ transaction in the replica. Both + # primary transactions should now be visible to a new snapshot. + secondary_cur.execute("commit") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (110001,) + + +def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): + """ + The CLOG-scanning mechanism fills the known-assigned XIDs array + optimistically at standby startup, betting that it can still fit + upcoming transactions replayed later from the WAL in the + array. This test tests what happens when that bet fails and the + known-assigned XID array fills up after the standby has already + been started. The WAL redo will fail with an error: + + FATAL: too many KnownAssignedXids + CONTEXT: WAL redo at 0/1895CB0 for neon/INSERT: off: 25, flags: 0x08; blkref #0: rel 1663/5/16385, blk 64 + + which causes the standby to shut down. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Determine how many connections we can use + primary_cur.execute("show max_connections") + max_connections = int(primary_cur.fetchall()[0][0]) + primary_cur.execute("show superuser_reserved_connections") + superuser_reserved_connections = int(primary_cur.fetchall()[0][0]) + n_connections = max_connections - superuser_reserved_connections + n_subxids = 200 + + # Start one top transaction in primary, with lots of subtransactions. This + # uses up much of the known-assigned XIDs space in the standby, but doesn't + # cause it to overflow. 
+ large_p_conn = primary.connect() + large_p_cur = large_p_conn.cursor() + large_p_cur.execute("begin") + large_p_cur.execute(f"select create_subxacts({max_connections} * 30)") + + with closing(primary.connect()) as small_p_conn: + with small_p_conn.cursor() as small_p_cur: + small_p_cur.execute("select create_subxacts(1)") + + # Create a replica at this LSN + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + + # The transaction in primary has not committed yet. + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + # Start max number of top transactions in primary, with a lot of + # subtransactions each. We add the subtransactions to each top transaction + # in a round-robin fashion, instead of adding a lot of subtransactions to + # one top transaction at a time. This way, we will have the max number of + # subtransactions in the in-memory subxid cache of each top transaction, + # until they all overflow. + # + # Currently, PGPROC_MAX_CACHED_SUBXIDS == 64, so this will overflow the all + # the subxid caches after creating 64 subxids in each top transaction. The + # point just before the caches have overflowed is the most interesting point + # in time, but we'll keep going beyond that, to ensure that this test is + # robust even if PGPROC_MAX_CACHED_SUBXIDS changes. + p_curs = [] + for _ in range(0, n_connections): + p_cur = primary.connect().cursor() + p_cur.execute("begin") + p_curs.append(p_cur) + + for _subxid in range(0, n_subxids): + for i in range(0, n_connections): + p_curs[i].execute("select create_subxacts(1)") + + # Commit all the transactions in the primary + for i in range(0, n_connections): + p_curs[i].execute("commit") + large_p_cur.execute("commit") + + # Wait until the replica crashes with "too many KnownAssignedXids" error. + def check_replica_crashed(): + try: + secondary.connect() + except psycopg2.Error: + # Once the connection fails, return success + return None + raise RuntimeError("connection succeeded") + + wait_until(20, 0.5, check_replica_crashed) + assert secondary.log_contains("too many KnownAssignedXids") + + # Replica is crashed, so ignore stop result + secondary.check_stop_result = False + + +def test_replica_start_repro_visibility_bug(neon_simple_env: NeonEnv): + """ + Before PR #7288, a hot standby in neon incorrectly started up + immediately, before it had received a running-xacts record. That + led to visibility bugs if there were active transactions in the + primary. This test reproduces the incorrect query results and + incorrectly set hint bits, before that was fixed. + """ + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + p_cur = primary.connect().cursor() + + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + wait_replica_caughtup(primary, secondary) + s_cur = secondary.connect().cursor() + + # Set hint bits for pg_class tuples. 
If primary's transaction is + # not marked as in-progress in MVCC snapshot, then XMIN_INVALID + # hint bit will be set for table's 't' tuple, making it invisible + # even after the commit record is replayed later. + s_cur.execute("select * from pg_class") + + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) + + +@pytest.mark.parametrize("shutdown", [True, False]) +def test_replica_start_with_prepared_xacts(neon_simple_env: NeonEnv, shutdown: bool): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions. + + This test is run in two variants: one where the primary server is shut down + before starting the secondary, or not. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute("create table t1(pk integer primary key)") + primary_cur.execute("create table t2(pk integer primary key)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("insert into t1 values (1)") + primary_cur.execute("prepare transaction 't1'") + + # Prepare another transaction for two-phase commit, with a subtransaction + primary_cur.execute("begin") + primary_cur.execute("insert into t2 values (2)") + primary_cur.execute("savepoint sp") + primary_cur.execute("insert into t2 values (3)") + primary_cur.execute("prepare transaction 't2'") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + if shutdown: + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t1") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t2") + assert secondary_cur.fetchone() == (0,) + + if shutdown: + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + else: + primary_cur.execute("commit") + primary_cur.execute("commit prepared 't1'") + primary_cur.execute("commit prepared 't2'") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + if shutdown: + assert secondary_cur.fetchone() == (0,) + else: + assert secondary_cur.fetchone() == (50,) + secondary_cur.execute("select * from t1") + assert secondary_cur.fetchall() == [(1,)] + secondary_cur.execute("select * from t2") + assert secondary_cur.fetchall() == [(2,), (3,)] + + +def test_replica_start_with_prepared_xacts_with_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with subtransactions. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Advance nextXid close to the beginning of the next pg_subtrans segment (2^16 XIDs) + # + # This is interesting, because it tests that pg_subtrans is initialized correctly + # at standby startup. (We had a bug where it didn't at one point during development.) + while True: + xid = int(query_scalar(primary_cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 500 transactions at a time until we get close + if xid < 65535 - 600: + primary_cur.execute("select test_consume_xids(500);") + else: + break + primary_cur.execute("checkpoint") + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(1000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed, and stop the primary + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (101000,) + + +def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with lots of subtransactions. + + Like test_replica_start_with_prepared_xacts_with_subxacts, but with more + subxacts, to test that the prepared transaction's subxids don't consume + space in the known-assigned XIDs array. (They are set in pg_subtrans + instead) + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit, with lots of subxids + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50000)") + + # to make things a bit more varied, intersperse a few other XIDs in between + # the prepared transaction's sub-XIDs + with primary.connect().cursor() as primary_cur2: + primary_cur2.execute("insert into t (payload) values (123)") + primary_cur2.execute("begin; insert into t (payload) values (-1); rollback") + + primary_cur.execute("select create_subxacts(50000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100001,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (200001,) diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py deleted file mode 100644 index 2360745990..0000000000 --- a/test_runner/regress/test_replication_start.py +++ /dev/null @@ -1,32 +0,0 @@ -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup - - -@pytest.mark.xfail -def test_replication_start(neon_simple_env: NeonEnv): - env = neon_simple_env - - with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: - with primary.connect() as p_con: - with p_con.cursor() as p_cur: - p_cur.execute("begin") - p_cur.execute("create table t(pk integer primary key, payload integer)") - p_cur.execute("insert into t values (generate_series(1,100000), 0)") - p_cur.execute("select txid_current()") - xid = p_cur.fetchall()[0][0] - log.info(f"Master transaction {xid}") - with env.endpoints.new_replica_start( - origin=primary, endpoint_id="secondary" - ) as secondary: - wait_replica_caughtup(primary, secondary) - with secondary.connect() as s_con: - with s_con.cursor() as s_cur: - # Enforce setting hint bits for pg_class tuples. - # If master's transaction is not marked as in-progress in MVCC snapshot, - # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. 
- s_cur.execute("select * from pg_class") - p_cur.execute("commit") - wait_replica_caughtup(primary, secondary) - s_cur.execute("select * from t where pk = 1") - assert s_cur.fetchone() == (1, 0) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 223dd92595..ad73770c44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 223dd925959f8124711dd3d867dc8ba6629d52c0 +Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f54d7373eb..4874c8e52e 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f54d7373eb0de5a54bce2becdb1c801026c7edff +Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e06bebc753..b810fdfcbb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e06bebc75306b583e758b52c95946d41109239b2 +Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 diff --git a/vendor/revisions.json b/vendor/revisions.json index 574e371934..da49ff19c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "e06bebc75306b583e758b52c95946d41109239b2"], - "v15": ["15.7", "f54d7373eb0de5a54bce2becdb1c801026c7edff"], - "v14": ["14.12", "223dd925959f8124711dd3d867dc8ba6629d52c0"] + "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], + "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], + "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] } From 86c8ba2563cec39f8fa852846bf185bd8c118424 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 1 Jul 2024 12:48:20 +0100 Subject: [PATCH 140/412] pageserver: add metric `pageserver_secondary_resident_physical_size` (#8204) ## Problem We lack visibility of how much local disk space is used by secondary tenant locations Close: https://github.com/neondatabase/neon/issues/8181 ## Summary of changes - Add `pageserver_secondary_resident_physical_size`, tagged by tenant - Register & de-register label sets from SecondaryTenant - Add+use wrappers in SecondaryDetail that update metrics when adding+removing layers/timelines --- pageserver/src/metrics.rs | 11 +- pageserver/src/tenant/secondary.rs | 37 +++- pageserver/src/tenant/secondary/downloader.rs | 173 ++++++++++++++---- 3 files changed, 171 insertions(+), 50 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f5aca6dfb3..9cd7ffa042 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -476,7 +476,7 @@ static STANDBY_HORIZON: Lazy = Lazy::new(|| { static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", - "The size of the layer files present in the pageserver's filesystem.", + "The size of the layer files present in the pageserver's filesystem, for attached locations.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") @@ -1691,6 +1691,15 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { } }); +pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for secondary locations.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, diff --git a/pageserver/src/tenant/secondary.rs 
b/pageserver/src/tenant/secondary.rs index af6840f525..a233d11c4a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -23,6 +23,8 @@ use super::{ storage_layer::LayerName, }; +use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; +use metrics::UIntGauge; use pageserver_api::{ models, shard::{ShardIdentity, TenantShardId}, @@ -99,6 +101,17 @@ pub(crate) struct SecondaryTenant { // Public state indicating overall progress of downloads relative to the last heatmap seen pub(crate) progress: std::sync::Mutex, + + // Sum of layer sizes on local disk + pub(super) resident_size_metric: UIntGauge, +} + +impl Drop for SecondaryTenant { + fn drop(&mut self) { + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + } } impl SecondaryTenant { @@ -108,6 +121,12 @@ impl SecondaryTenant { tenant_conf: TenantConfOpt, config: &SecondaryLocationConfig, ) -> Arc { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the @@ -123,6 +142,8 @@ impl SecondaryTenant { detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), progress: std::sync::Mutex::default(), + + resident_size_metric, }) } @@ -211,16 +232,12 @@ impl SecondaryTenant { // have to 100% match what is on disk, because it's a best-effort warming // of the cache. let mut detail = this.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - let removed = timeline_detail.on_disk_layers.remove(&name); - - // We might race with removal of the same layer during downloads, if it was removed - // from the heatmap. If we see that the OnDiskState is gone, then no need to - // do a physical deletion or store in evicted_at. - if let Some(removed) = removed { - removed.remove_blocking(); - timeline_detail.evicted_at.insert(name, now); - } + if let Some(removed) = + detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) + { + // We might race with removal of the same layer during downloads, so finding the layer we + // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. + removed.remove_blocking(); } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index f6f30641db..27439d4f03 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -46,6 +46,7 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; @@ -131,16 +132,66 @@ impl OnDiskState { .or_else(fs_ext::ignore_not_found) .fatal_err("Deleting secondary layer") } + + pub(crate) fn file_size(&self) -> u64 { + self.metadata.file_size + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. 
pub(super) evicted_at: HashMap, } +impl SecondaryDetailTimeline { + pub(super) fn remove_layer( + &mut self, + name: &LayerName, + resident_metric: &UIntGauge, + ) -> Option { + let removed = self.on_disk_layers.remove(name); + if let Some(removed) = &removed { + resident_metric.sub(removed.file_size()); + } + removed + } + + /// `local_path` + fn touch_layer( + &mut self, + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + touched: &HeatMapLayer, + resident_metric: &UIntGauge, + local_path: F, + ) where + F: FnOnce() -> Utf8PathBuf, + { + use std::collections::hash_map::Entry; + match self.on_disk_layers.entry(touched.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = touched.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + conf, + tenant_shard_id, + timeline_id, + touched.name.clone(), + touched.metadata.clone(), + touched.access_time, + local_path(), + )); + resident_metric.add(touched.metadata.file_size); + } + } + } +} + // Aspects of a heatmap that we remember after downloading it #[derive(Clone, Debug)] struct DownloadSummary { @@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail { last_download: Option, next_download: Option, - pub(super) timelines: HashMap, + timelines: HashMap, } /// Helper for logging SystemTime @@ -191,6 +242,38 @@ impl SecondaryDetail { } } + pub(super) fn evict_layer( + &mut self, + name: LayerName, + timeline_id: &TimelineId, + now: SystemTime, + resident_metric: &UIntGauge, + ) -> Option { + let timeline = self.timelines.get_mut(timeline_id)?; + let removed = timeline.remove_layer(&name, resident_metric); + if removed.is_some() { + timeline.evicted_at.insert(name, now); + } + removed + } + + pub(super) fn remove_timeline( + &mut self, + timeline_id: &TimelineId, + resident_metric: &UIntGauge, + ) { + let removed = self.timelines.remove(timeline_id); + if let Some(removed) = removed { + resident_metric.sub( + removed + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + } + } + /// Additionally returns the total number of layers, used for more stable relative access time /// based eviction. pub(super) fn get_layers_for_eviction( @@ -601,8 +684,13 @@ impl<'a> TenantDownloader<'a> { Some(t) => t, None => { // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, timeline).await; + let timeline_state = init_timeline_state( + self.conf, + tenant_shard_id, + timeline, + &self.secondary_state.resident_size_metric, + ) + .await; // Re-acquire detail lock now that we're done with async load from local FS self.secondary_state @@ -671,6 +759,25 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Metrics consistency check in testing builds + if cfg!(feature = "testing") { + let detail = self.secondary_state.detail.lock().unwrap(); + let resident_size = detail + .timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::(); + assert_eq!( + resident_size, + self.secondary_state.resident_size_metric.get() + ); + } + // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. 
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { @@ -783,7 +890,7 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. - detail.timelines.remove(delete_timeline); + detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); } } @@ -801,7 +908,7 @@ impl<'a> TenantDownloader<'a> { let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { continue; }; - timeline_state.on_disk_layers.remove(&layer_name); + timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); } for timeline_id in delete_timelines { @@ -1000,33 +1107,24 @@ impl<'a> TenantDownloader<'a> { let timeline_detail = detail.timelines.entry(timeline_id).or_default(); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); - - for t in touched { - use std::collections::hash_map::Entry; - match timeline_detail.on_disk_layers.entry(t.name.clone()) { - Entry::Occupied(mut v) => { - v.get_mut().access_time = t.access_time; - } - Entry::Vacant(e) => { - let local_path = local_layer_path( + touched.into_iter().for_each(|t| { + timeline_detail.touch_layer( + self.conf, + tenant_shard_id, + &timeline_id, + &t, + &self.secondary_state.resident_size_metric, + || { + local_layer_path( self.conf, tenant_shard_id, &timeline_id, &t.name, &t.metadata.generation, - ); - e.insert(OnDiskState::new( - self.conf, - tenant_shard_id, - &timeline_id, - t.name, - t.metadata.clone(), - t.access_time, - local_path, - )); - } - } - } + ) + }, + ) + }); } result @@ -1135,6 +1233,7 @@ async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, heatmap: &HeatMapTimeline, + resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut detail = SecondaryDetailTimeline::default(); @@ -1210,17 +1309,13 @@ async fn init_timeline_state( } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. 
- detail.on_disk_layers.insert( - name.clone(), - OnDiskState::new( - conf, - tenant_shard_id, - &heatmap.timeline_id, - name, - remote_meta.metadata.clone(), - remote_meta.access_time, - file_path, - ), + detail.touch_layer( + conf, + tenant_shard_id, + &heatmap.timeline_id, + remote_meta, + resident_metric, + || file_path, ); } } From 325294bced98ade28513d428f55531e9b7e33b8b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 1 Jul 2024 13:11:55 +0100 Subject: [PATCH 141/412] CI(build-tools): Remove libpq from build image (#8206) ## Problem We use `build-tools` image as a base image to build other images, and it has a pretty old `libpq-dev` installed (v13; it wasn't that old until I removed system Postgres 14 from `build-tools` image in https://github.com/neondatabase/neon/pull/6540) ## Summary of changes - Remove `libpq-dev` from `build-tools` image - Set `LD_LIBRARY_PATH` for tests (for different Postgres binaries that we use, like psql and pgbench) - Set `PQ_LIB_DIR` to build Storage Controller - Set `LD_LIBRARY_PATH`/`DYLD_LIBRARY_PATH` in the Storage Controller where it calls Postgres binaries --- .../actions/run-python-test-set/action.yml | 1 + .github/workflows/benchmarking.yml | 4 +++ .github/workflows/build-build-tools-image.yml | 1 + .github/workflows/build_and_test.yml | 7 ++++ .github/workflows/neon_extra_builds.yml | 7 ++++ Dockerfile | 3 +- Dockerfile.build-tools | 1 - control_plane/src/local_env.rs | 11 ++++-- control_plane/src/storage_controller.rs | 34 +++++++++++++++---- 9 files changed, 57 insertions(+), 12 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index c6ea52ba88..a2aae0772b 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -114,6 +114,7 @@ runs: export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} + export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index db4209500f..0e748adeb6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -379,6 +379,10 @@ jobs: - name: Add Postgres binaries to PATH run: | + LD_LIBRARY_PATH="${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib" + export LD_LIBRARY_PATH + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 5a94dd8e6f..f1c39e7e4f 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -82,6 +82,7 @@ jobs: tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory + if: always() run: | rm -rf /tmp/.docker-custom diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9cea9f4148..24ad26205b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -335,6 +335,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS 
$CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -383,6 +385,11 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + #nextest does not yet support running doctests cargo test --doc $CARGO_FLAGS $CARGO_FEATURES diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7d2187e59c..330d858c0e 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -232,12 +232,19 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 diff --git a/Dockerfile b/Dockerfile index b4900d4a94..f0197758e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index f85706ef6a..30314376ef 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -26,7 +26,6 @@ RUN set -e \ liblzma-dev \ libncurses5-dev \ libncursesw5-dev \ - libpq-dev \ libreadline-dev \ libseccomp-dev \ libsqlite3-dev \ diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6634274d2a..3ac3ce21df 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -325,11 +325,16 @@ impl LocalEnv { } } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("bin")) + pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) } + + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + self.pg_dir(pg_version, "bin") + } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("lib")) + self.pg_dir(pg_version, "lib") } pub fn pageserver_bin(&self) -> PathBuf { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 5ca1b13b2a..47103a2e0a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -155,16 +155,16 @@ impl StorageController { .expect("non-Unicode path") } - /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. - pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; for v in prefer_versions { - let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? { return Ok(path); } @@ -172,11 +172,20 @@ impl StorageController { // Fall through anyhow::bail!( - "Postgres binaries not found in {}", - self.env.pg_distrib_dir.display() + "Postgres directory '{}' not found in {}", + dir_name, + self.env.pg_distrib_dir.display(), ); } + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + self.get_pg_dir("bin").await + } + + pub async fn get_pg_lib_dir(&self) -> anyhow::Result { + self.get_pg_dir("lib").await + } + /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); @@ -229,12 +238,17 @@ impl StorageController { .unwrap() .join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? 
{ // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) .args(["-D", pg_data_path.as_ref()]) .spawn() .expect("Failed to spawn initdb"); @@ -269,7 +283,10 @@ impl StorageController { &self.env.base_data_dir, pg_bin_dir.join("pg_ctl").as_std_path(), db_start_args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.postgres_pid_file()), retry_timeout, || self.pg_isready(&pg_bin_dir), @@ -324,7 +341,10 @@ impl StorageController { &self.env.base_data_dir, &self.env.storage_controller_bin(), args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.pid_file()), retry_timeout, || async { From cf4ea92aadac1b0b9ac5b1e84763b48f7d4d61fe Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 1 Jul 2024 10:36:49 -0400 Subject: [PATCH 142/412] fix(pageserver): include aux file in basebackup only once (#8207) Extracted from https://github.com/neondatabase/neon/pull/6560, currently we include multiple copies of aux files in the basebackup. ## Summary of changes Fix the loop. Signed-off-by: Alex Chi Z Co-authored-by: Konstantin Knizhnik --- pageserver/src/basebackup.rs | 57 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0f057a4368..207f781e1b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -348,35 +348,36 @@ where self.add_rel(rel, rel).await?; } } - - for (path, content) in self - .timeline - .list_aux_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? - { - if path.starts_with("pg_replslot") { - let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; - let restart_lsn = Lsn(u64::from_le_bytes( - content[offs..offs + 8].try_into().unwrap(), - )); - info!("Replication slot {} restart LSN={}", path, restart_lsn); - min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); - } else if path == "pg_logical/replorigin_checkpoint" { - // replorigin_checkoint is written only on compute shutdown, so it contains - // deteriorated values. So we generate our own version of this file for the particular LSN - // based on information about replorigins extracted from transaction commit records. - // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, - // but now we should handle (skip) it for backward compatibility. - continue; - } - let header = new_tar_header(&path, content.len() as u64)?; - self.ar - .append(&header, &*content) - .await - .context("could not add aux file to basebackup tarball")?; - } } + + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
+ { + if path.starts_with("pg_replslot") { + let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; + let restart_lsn = Lsn(u64::from_le_bytes( + content[offs..offs + 8].try_into().unwrap(), + )); + info!("Replication slot {} restart LSN={}", path, restart_lsn); + min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. + continue; + } + let header = new_tar_header(&path, content.len() as u64)?; + self.ar + .append(&header, &*content) + .await + .context("could not add aux file to basebackup tarball")?; + } + if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", From 504ca7720f5a926782e18c1739332d419ca4bb55 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 1 Jul 2024 16:42:23 +0100 Subject: [PATCH 143/412] CI(gather-rust-build-stats): fix build with libpq (#8219) ## Problem I've missed setting `PQ_LIB_DIR` in https://github.com/neondatabase/neon/pull/8206 in `gather-rust-build-stats` job and it fails now: ``` = note: /usr/bin/ld: cannot find -lpq collect2: error: ld returned 1 exit status error: could not compile `storage_controller` (bin "storage_controller") due to 1 previous error ``` https://github.com/neondatabase/neon/actions/runs/9743960062/job/26888597735 ## Summary of changes - Set `PQ_LIB_DIR` for `gather-rust-build-stats` job --- .github/workflows/neon_extra_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 330d858c0e..11ff634b6c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -385,7 +385,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats From 588bda98e7ffc96a20ccf46e8f5bd50dc443e4fe Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 18:55:18 +0300 Subject: [PATCH 144/412] tests: Make neon_xlogflush() flush all WAL, if you omit the LSN arg (#8215) This makes it much more convenient to use in the common case that you want to flush all the WAL. (Passing pg_current_wal_insert_lsn() as the argument doesn't work for the same reasons as explained in the comments: we need to be back off to the beginning of a page if the previous record ended at page boundary.) I plan to use this to fix the issue that Arseny Sher called out at https://github.com/neondatabase/neon/pull/7288#discussion_r1660063852 --- pgxn/neon_test_utils/neontest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 944936d395..071dc122ed 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -469,9 +469,9 @@ neon_xlogflush(PG_FUNCTION_ARGS) * The LSN returned by GetXLogInsertRecPtr() is the position where the * next inserted record would begin. 
If the last record ended just at * the page boundary, the next record will begin after the page header - * on the next page, and that's what GetXLogInsertRecPtr().returns, - * but the page header has not been written yet. If we tried to flush - * it, XLogFlush() would throw an error: + * on the next page, but the next page's page header has not been + * written yet. If we tried to flush it, XLogFlush() would throw an + * error: * * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X * From 8c0ec2f681e269a2602a374eb2a8706eb8313654 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 1 Jul 2024 18:44:28 +0100 Subject: [PATCH 145/412] docs: Graceful storage controller cluster restarts RFC (#7704) RFC for "Graceful Restarts of Storage Controller Managed Clusters". Related https://github.com/neondatabase/neon/issues/7387 --- .../033-storage-controller-drain-and-fill.md | 345 ++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 docs/rfcs/033-storage-controller-drain-and-fill.md diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md new file mode 100644 index 0000000000..77c84cd2a5 --- /dev/null +++ b/docs/rfcs/033-storage-controller-drain-and-fill.md @@ -0,0 +1,345 @@ +# Graceful Restarts of Storage Controller Managed Clusters + +## Summary +This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes. +It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement +graceful cluster restarts. + +## Motivation + +Pageserver restarts cause read availablity downtime for tenants. + +For example pageserver-3 @ us-east-1 was unavailable for a randomly +picked tenant (which requested on-demand activation) for around 30 seconds +during the restart at 2024-04-03 16:37 UTC. + +Note that lots of shutdowns on loaded pageservers do not finish within the +[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers +and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. + +This problem is not yet very acutely felt in storage controller managed pageservers since +tenant density is much lower there. However, we are planning on eventually migrating all +pageservers to storage controller management, so it makes sense to solve the issue proactively. + +## Requirements + +- Pageserver re-deployments cause minimal downtime for tenants +- The storage controller exposes HTTP API hooks for draining and filling tenant shards +from a given pageserver. Said hooks can be used by an orchestrator proces or a human operator. +- The storage controller exposes some HTTP API to cancel draining and filling background operations. +- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed +as usual (with downtime). +- Progress of draining/filling is visible through metrics + +## Non Goals + +- Integration with the control plane +- Graceful restarts for large non-HA tenants. + +## Impacted Components + +- storage controller +- deployment orchestrator (i.e. 
Ansible)
+- pageserver (indirectly)
+
+## Terminology
+
+**Draining** is the process through which all tenant shards that can be migrated from a given pageserver
+are distributed across the rest of the cluster.
+
+**Filling** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
+pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers.
+
+**Node scheduling policies** act as constraints to the scheduler. For instance, when a
+node is set to the `Paused` policy, no further shards will be scheduled on it.
+
+**Node** is a pageserver. The terms are used interchangeably in this RFC.
+
+**Deployment orchestrator** is a generic term for whatever drives our deployments.
+Currently, it's an Ansible playbook.
+
+## Background
+
+### Storage Controller Basics (skip if already familiar)
+
+Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver node and tenant shard metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
+
+An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook.
+
+### Background Optimizations
+
+The storage controller performs scheduling optimizations in the background. It will
+migrate attachments to warm secondaries and replace secondaries in order to balance
+the cluster out.
+
+### Reconciliations Concurrency Limiting
+
+There's a hard limit on the number of reconciles that the storage controller
+can have in flight at any given time. To give an idea of scale, the limit is
+128 at the time of writing.
+
+## Implementation
+
+Note: this section focuses on the core functionality of the graceful restart process.
+It doesn't necessarily describe the most efficient approach. Optimizations are described
+separately in a later section.
+
+### Overall Flow
+
+This section describes how to implement graceful restarts from the perspective
+of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
+The orchestrator shall implement the following prologue and epilogue steps for each
+pageserver restart:
+
+#### Prologue
+
+The orchestrator shall first fetch the pageserver node id from the control plane or
+the pageserver it aims to restart directly. Next, it issues an HTTP request
+to the storage controller in order to start the drain of said pageserver node.
+All error responses are retried with a short back-off. When a 202 (Accepted)
+HTTP code is returned, the drain has started. Now the orchestrator polls the
+node status endpoint exposed by the storage controller in order to await the
+end of the drain process. When the `policy` field of the node status response
+becomes `PauseForRestart`, the drain has completed and the orchestrator can
+proceed with restarting the pageserver.
+
+The prologue is subject to an overall timeout. It will have a value in the ballpark
+of minutes. As storage controller managed pageservers become more loaded this timeout
+will likely have to increase.
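+
+As an illustration, a minimal sketch of the prologue in Python follows. It assumes a
+`requests`-style HTTP client, a placeholder storage controller base URL, and that the
+node status endpoint lives at `/v1/control/node/:node_id` (the exact status path is an
+assumption here); the drain endpoint, the 202 response and the `policy` field are the
+ones described in the APIs section below.
+
+```python
+import time
+
+import requests
+
+STORCON = "http://storage-controller.local:1234"  # placeholder base URL (assumption)
+PROLOGUE_TIMEOUT = 10 * 60                        # overall prologue timeout, seconds
+
+
+def drain_and_wait(node_id: int) -> bool:
+    """Start a drain and wait for the node to reach PauseForRestart.
+
+    Returns False on timeout; the restart should proceed anyway, since a
+    failed drain is not fatal (the restart just loses its gracefulness).
+    """
+    deadline = time.monotonic() + PROLOGUE_TIMEOUT
+
+    # Kick off the drain; error responses are retried with a short back-off.
+    while requests.put(f"{STORCON}/v1/control/node/{node_id}/drain").status_code != 202:
+        if time.monotonic() > deadline:
+            return False
+        time.sleep(2)
+
+    # Poll the node status until its scheduling policy flips to PauseForRestart.
+    while time.monotonic() < deadline:
+        status = requests.get(f"{STORCON}/v1/control/node/{node_id}").json()
+        if status["policy"] == "PauseForRestart":
+            return True
+        time.sleep(5)
+    return False
+```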
+ +#### Epilogue + +After restarting the pageserver, the orchestrator issues an HTTP request +to the storage controller to kick off the filling process. This API call +may be retried for all error codes with a short backoff. This also serves +as a synchronization primitive as the fill will be refused if the pageserver +has not yet re-attached to the storage controller. When a 202(Accepted) HTTP +code is returned, the fill has started. Now the orchestrator polls the node +status endpoint exposed by the storage controller in order to await the end of +the filling process. When the `policy` field of the node status response becomes +`Active`, the fill has completed and the orchestrator may proceed to the next pageserver. + +Again, the epilogue is subject to an overall timeout. We can start off with +using the same timeout as for the prologue, but can also consider relying on +the storage controller's background optimizations with a shorter timeout. + +In the case that the deployment orchestrator times out, it attempts to cancel +the fill. This operation shall be retried with a short back-off. If it ultimately +fails it will require manual intervention to set the nodes scheduling policy to +`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic, +but it constrains the scheduler as mentioned previously. + +### Node Scheduling Policy State Machine + +The state machine below encodes the behaviours discussed above and +the various failover situations described in a later section. + +Assuming no failures and/or timeouts the flow should be: +`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active` + +``` + Operator requested drain + +-----------------------------------------+ + | | + +-------+-------+ +-------v-------+ + | | | | + | Pause | +-----------> Draining +----------+ + | | | | | | + +---------------+ | +-------+-------+ | + | | | + | | | + Drain requested| | | + | |Drain complete | Drain failed + | | | Cancelled/PS reattach/Storcon restart + | | | + +-------+-------+ | | + | | | | + +-------------+ Active <-----------+------------------+ + | | | | +Fill requested | +---^---^-------+ | + | | | | + | | | | + | | | | + | Fill completed| | | + | | |PS reattach | + | | |after restart | + +-------v-------+ | | +-------v-------+ + | | | | | | + | Filling +---------+ +-----------+PauseForRestart| + | | | | + +---------------+ +---------------+ +``` + +### Draining/Filling APIs + +The storage controller API to trigger the draining of a given node is: +`PUT /v1/control/node/:node_id/{drain,fill}`. + +The following HTTP non-success return codes are used. +All of them are safely retriable from the perspective of the storage controller. +- 404: Requested node was not found +- 503: Requested node is known to the storage controller, but unavailable +- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining +- 409: A {drain, fill} is already in progress. Only one such background operation +is allowed per node. + +When the drain is accepted and commenced a 202 HTTP code is returned. + +Drains and fills shall be cancellable by the deployment orchestrator or a +human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200 +response is returned when the cancelation is successful. Errors are retriable. 
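+
+For symmetry, a sketch of how an orchestrator might drive the fill side of these APIs
+(same assumptions as the prologue sketch above: a `requests`-style client, a placeholder
+base URL and node status path; the endpoints, the 202 response and the `policy` field are
+the ones documented in this section):
+
+```python
+import time
+
+import requests
+
+STORCON = "http://storage-controller.local:1234"  # placeholder base URL (assumption)
+EPILOGUE_TIMEOUT = 10 * 60                        # overall epilogue timeout, seconds
+
+
+def fill_and_wait(node_id: int) -> None:
+    """Start a fill after the pageserver restart and wait for it to finish."""
+    deadline = time.monotonic() + EPILOGUE_TIMEOUT
+
+    # Kick off the fill; error responses are retried with a short back-off.
+    # This also waits for the restarted pageserver to re-attach, since the
+    # fill is refused until it has done so.
+    while requests.put(f"{STORCON}/v1/control/node/{node_id}/fill").status_code != 202:
+        if time.monotonic() > deadline:
+            return  # give up; the node stays Active, which is harmless
+        time.sleep(2)
+
+    # Poll until the node goes back to Active, i.e. the fill has completed.
+    while time.monotonic() < deadline:
+        status = requests.get(f"{STORCON}/v1/control/node/{node_id}").json()
+        if status["policy"] == "Active":
+            return
+        time.sleep(5)
+
+    # Timed out: try to cancel the fill, retrying with a short back-off.
+    for _ in range(10):
+        if requests.delete(f"{STORCON}/v1/control/node/{node_id}/fill").status_code == 200:
+            return
+        time.sleep(2)
+```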
+
+### Drain Process
+
+Before accepting a drain request, the following validations are applied:
+* Ensure that the node is known to the storage controller
+* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
+* Ensure that another drain or fill is not already running on the node
+* Ensure that a drain is possible (i.e. check that there is at least one
+schedulable node to drain to)
+
+After accepting the drain, the scheduling policy of the node is set to
+`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
+This prevents the optimizer from adding or removing shards from the node, which
+is desirable to avoid them racing.
+
+Next, a separate Tokio task is spawned to manage the draining. For each tenant
+shard attached to the node being drained, demote the node to a secondary and
+attempt to schedule the node away. Scheduling might fail due to unsatisfiable
+constraints, but that is fine. Draining is a best-effort process since it might
+not always be possible to cut over all shards.
+
+Importantly, this task manages the concurrency of issued reconciles in order to
+avoid drowning out the target pageservers and to allow other important reconciles
+to proceed.
+
+Once the triggered reconciles have finished or timed out, set the node's scheduling
+policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
+
+A note on non-HA tenants: these tenants do not have secondaries, so by the description
+above, they would not be migrated. It makes sense to skip them (especially the large ones)
+since, depending on tenant size, this might be more disruptive than the restart since the
+pageserver we've moved to will need to on-demand download the entire working set for the tenant.
+We can consider expanding to small non-HA tenants in the future.
+
+### Fill Process
+
+Before accepting a fill request, the following validations are applied:
+* Ensure that the node is known to the storage controller
+* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
+This is the only acceptable policy for the fill starting state. When a node re-attaches,
+it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
+`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
+* Ensure that another drain or fill is not already running on the node
+
+After accepting the fill, the scheduling policy of the node is set to
+`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
+This prevents the optimizer from adding or removing shards from the node, which
+is desirable to avoid them racing.
+
+Next, a separate Tokio task is spawned to manage the filling. For each tenant
+shard where the filled node is a secondary, promote the secondary. This is done
+until we run out of shards or the counts of attached shards become balanced across
+the cluster.
+
+Like for draining, the concurrency of spawned reconciles is limited.
+
+### Failure Modes & Handling
+
+Failures are generally handled by transitioning back into the `Active`
+(neutral) state. This simplifies the implementation greatly at the
+cost of adding transitions to the state machine. For example, we
+could detect the `Draining` state upon restart and proceed with a drain,
+but how should the storage controller know that this is still what the
+orchestrator needs?
+
+#### Storage Controller Crash
+
+When the storage controller starts up, reset the node scheduling policy
+of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
+`Active`. The rationale is that when the storage controller restarts,
+we have lost context of what the deployment orchestrator wants. It also
+has the benefit of making things easier to reason about.
+
+#### Pageserver Crash During Drain
+
+The pageserver will attempt to re-attach during restart at which
+point the node scheduling policy will be set back to `Active`, thus
+re-enabling the scheduler to use the node.
+
+#### Non-drained Pageserver Crash During Drain
+
+What should happen when a pageserver we are draining to crashes during the
+process? Two reasonable options are: cancel the drain and focus on the failover
+*or* do both, but prioritise failover. Since the number of concurrent reconciles
+produced by drains/fills is limited, we get the latter behaviour for free.
+My suggestion is we take this approach, but the cancellation option is trivial
+to implement as well.
+
+#### Pageserver Crash During Fill
+
+The pageserver will attempt to re-attach during restart at which
+point the node scheduling policy will be set back to `Active`, thus
+re-enabling the scheduler to use the node.
+
+#### Pageserver Goes Unavailable During Drain/Fill
+
+The drain and fill jobs handle this by stopping early. When the pageserver
+is detected as online by storage controller heartbeats, reset its scheduling
+policy to `Active`. If a restart happens instead, see the pageserver crash
+failure mode.
+
+#### Orchestrator Drain Times Out
+
+The orchestrator will still proceed with the restart.
+When the pageserver re-attaches, the scheduling policy is set back to
+`Active`.
+
+#### Orchestrator Fill Times Out
+
+The orchestrator will attempt to cancel the fill operation. If that fails,
+the fill will continue until it quiesces and the node will be left
+in the `Filling` scheduling policy. This hinders the scheduler, but is
+otherwise harmless. A human operator can handle this by setting the scheduling
+policy to `Active`, or we can bake a fill timeout into the storage controller.
+
+## Optimizations
+
+### Location Warmth
+
+When cutting over to a secondary, the storage controller will wait for it to
+become "warm" (i.e. download enough of the tenant's data). This means that some
+reconciliations can take significantly longer than others and hold up precious
+reconciliation units. As an optimization, the drain stage can only cut over
+tenants that are already "warm". Similarly, the fill stage can prioritise the
+"warmest" tenants in the fill.
+
+Given that the number of tenants managed by the storage controller will be fairly low
+for the foreseeable future, the first implementation could simply query the tenants
+for secondary status. This doesn't scale well with increasing tenant counts, so
+eventually we will need new pageserver API endpoints to report the sets of
+"warm" and "cold" nodes.
+
+## Alternatives Considered
+
+### Draining and Filling Purely as Scheduling Constraints
+
+At its core, the storage controller is a big background loop that detects changes
+in the environment and reacts to them. One could express draining and filling
+of nodes purely in terms of constraining the scheduler (as opposed to having
+such background tasks).
+
+While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about.
+Consider cancellation of a drain/fill operation.
We would have to update the scheduler state, create +an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish +to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong +to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion. + +It would also mean that reconciliations themselves have side effects that persist in the database +(persist something to the databse when the drain is done), which I'm not conceptually fond of. + +## Proof of Concept + +This RFC is accompanied by a POC which implements nearly everything mentioned here +apart from the optimizations and some of the failure handling: +https://github.com/neondatabase/neon/pull/7682 From 47c50ec46081beef2b3b46600d6858635ebb64d9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 2 Jul 2024 06:56:10 +0300 Subject: [PATCH 146/412] Check status of connection after PQconnectStartParams (#8210) ## Problem See https://github.com/neondatabase/cloud/issues/14289 ## Summary of changes Check connection status after calling PQconnectStartParams ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a665cafafe..a3fdcc537e 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -432,7 +432,17 @@ pageserver_connect(shardno_t shard_no, int elevel) neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); return false; } - + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } shard->state = PS_Connecting_Startup; /* fallthrough */ } From ed3b97604c3b621f20cbfec84f1638ce59292917 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 2 Jul 2024 12:53:08 +0200 Subject: [PATCH 147/412] remote_storage config: move handling of empty inline table `{}` to callers (#8193) Before this PR, `RemoteStorageConfig::from_toml` would support deserializing an empty `{}` TOML inline table to a `None`, otherwise try `Some()`. We can instead let * in proxy: let clap derive handle the Option * in PS & SK: assume that if the field is specified, it must be a valid RemtoeStorageConfig (This PR started with a much simpler goal of factoring out the `deserialize_item` function because I need that in another PR). 
--- Cargo.lock | 1 + libs/remote_storage/src/config.rs | 25 ++++++------------------- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 2 ++ libs/utils/src/toml_edit_ext.rs | 22 ++++++++++++++++++++++ pageserver/ctl/src/main.rs | 2 +- pageserver/src/config.rs | 19 ++++++++++++++++--- proxy/src/bin/proxy.rs | 9 ++++----- proxy/src/config.rs | 8 ++------ proxy/src/context/parquet.rs | 15 ++++++--------- safekeeper/src/bin/safekeeper.rs | 13 ++----------- test_runner/fixtures/neon_fixtures.py | 4 +++- 12 files changed, 66 insertions(+), 55 deletions(-) create mode 100644 libs/utils/src/toml_edit_ext.rs diff --git a/Cargo.lock b/Cargo.lock index 5393538c59..6dae8e3403 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6811,6 +6811,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 8a8f6212e9..fa3f2cba58 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,6 +1,5 @@ use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; -use anyhow::bail; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; @@ -176,20 +175,8 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) } } @@ -197,7 +184,7 @@ impl RemoteStorageConfig { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { + fn parse(input: &str) -> anyhow::Result { let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -207,7 +194,7 @@ mod tests { let input = "local_path = '.' 
timeout = '5s'"; - let config = parse(input).unwrap().expect("it exists"); + let config = parse(input).unwrap(); assert_eq!( config, @@ -229,7 +216,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, @@ -257,7 +244,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c1..261ca2cc1a 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit.workspace = true tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4..2a397d97d2 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -94,6 +94,8 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 0000000000..ab5f7bdd95 --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c61..ea09a011e5 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let config = RemoteStorageConfig::from_toml(toml_item)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config); let cancel = CancellationToken::new(); storage diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f36e63f035..2b698b75dc 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -159,7 +159,7 @@ pub mod defaults { #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} -[remote_storage] +#[remote_storage] "# ); @@ -918,7 +918,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) 
+ builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -946,7 +946,7 @@ impl PageServerConf { builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), @@ -1681,6 +1681,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" +remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(input).unwrap(); + let err = PageServerConf::parse_and_validate(&doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf5580..7f4cb2c010 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -35,6 +35,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -205,8 +206,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] @@ -511,9 +512,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa..af5511d7ec 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -399,15 +399,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. 
-pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e72bf199e3..cfc1f8e89e 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,17 +14,14 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use super::{RequestMonitoring, LOG_CHAN}; @@ -33,11 +30,11 @@ pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index c81373c77c..d25b8722ac 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 +12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -126,7 +125,7 @@ struct Args { peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. - /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system. @@ -553,16 +552,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. 
fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) } #[test] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e1c8514351..565aaba6e0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1167,7 +1167,9 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() + sk_cfg[ + "remote_storage" + ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) cfg["safekeepers"].append(sk_cfg) From 7ff9989dd523c9a165edc935038e21f6ec57ded4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 2 Jul 2024 13:45:04 +0100 Subject: [PATCH 148/412] pageserver: simpler, stricter config error handling (#8177) ## Problem Tenant attachment has error paths for failures to write local configuration, but these types of local storage I/O errors should be considered fatal for the process. Related thread on an earlier PR that touched this code: https://github.com/neondatabase/neon/pull/7947#discussion_r1655134114 ## Summary of changes - Make errors writing tenant config fatal (abort process) - When reading tenant config, make all I/O errors except ENOENT fatal - Replace use of bare anyhow errors with `LoadConfigError` --- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 78 ++++++------ pageserver/src/tenant/mgr.rs | 191 +++++++++++++--------------- test_runner/regress/test_tenants.py | 25 +++- 4 files changed, 154 insertions(+), 144 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1fda2eaa85..f726ba115d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -227,7 +227,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) | Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -1296,7 +1296,7 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3ffbaf98c6..116481a1eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -529,6 +529,15 @@ impl From for GcError { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] 
toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -2563,36 +2572,35 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, - ) -> anyhow::Result { + ) -> Result { let config_path = conf.tenant_location_config_path(tenant_shard_id); - if config_path.exists() { - // New-style config takes precedence - let deserialized = Self::read_config(&config_path)?; - Ok(toml_edit::de::from_document::(deserialized)?) - } else { - // The config should almost always exist for a tenant directory: - // - When attaching a tenant, the config is the first thing we write - // - When detaching a tenant, we atomically move the directory to a tmp location - // before deleting contents. - // - // The very rare edge case that can result in a missing config is if we crash during attach - // between creating directory and writing config. Callers should handle that as if the - // directory didn't exist. - anyhow::bail!("tenant config not found in {}", config_path); - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; + + Ok(toml_edit::de::from_str::(&config)?) } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2600,7 +2608,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await @@ -2611,7 +2619,7 @@ impl Tenant { tenant_shard_id: &TenantShardId, config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2620,22 +2628,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. 
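As a free-standing illustration of the read-side policy introduced above in `load_tenant_config` (a missing file is the only recoverable case; any other I/O error is treated as a fatal local-storage problem), here is a minimal sketch; it is not part of the diff, and `panic!` merely stands in for the pageserver's `on_fatal_io_error`, which aborts the process:

```rust
use std::io::ErrorKind;
use std::path::Path;

/// Minimal sketch of the policy above: a missing file means "no config yet",
/// every other I/O error is unrecoverable.
fn read_config(path: &Path) -> Option<String> {
    match std::fs::read_to_string(path) {
        Ok(contents) => Some(contents),
        Err(e) if e.kind() == ErrorKind::NotFound => None,
        // The real code calls `on_fatal_io_error`, which aborts the process.
        Err(e) => panic!("fatal I/O error reading {}: {e}", path.display()),
    }
}

fn main() {
    assert_eq!(read_config(Path::new("/no/such/tenant/config-v1")), None);
}
```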
- conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; - - Ok(()) + VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await } // diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 08c3f19b6f..c1da1d2c55 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,8 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -272,7 +273,7 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() @@ -296,7 +297,7 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { @@ -346,56 +347,32 @@ async fn init_load_generations( /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// -/// If file is missing, return Ok(None) +/// If we cleaned up something expected (like an empty dir or a temp dir), return None. 
fn load_tenant_config( conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path, e - ); - } - return Ok(None); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("Deleting temporary tenant dir"); + return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() - .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?; + .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path - ) - } - return Ok(None); + std::fs::remove_dir(&tenant_dir_path).fatal_err("Deleting empty tenant dir"); + return None; } - let tenant_shard_id = match tenant_dir_path - .file_name() - .unwrap_or_default() - .parse::() - { - Ok(id) => id, - Err(_) => { - warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",); - return Ok(None); - } - }; - - Ok(Some(( - tenant_shard_id, - Tenant::load_tenant_config(conf, &tenant_shard_id), - ))) + Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -405,32 +382,51 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> HashMap> { let tenants_dir = conf.tenants_path(); - let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { - let dir_entries = tenants_dir - .read_dir_utf8() - .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let dentries = tokio::task::spawn_blocking(move || -> Vec { + let context = format!("Reading tenants dir {tenants_dir}"); + let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); - Ok(dir_entries.collect::, std::io::Error>>()?) + dir_entries + .collect::, std::io::Error>>() + .fatal_err(&context) }) - .await??; + .await + .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { - join_set.spawn_blocking(move || load_tenant_config(conf, dentry)); + let tenant_shard_id = match dentry.file_name().parse::() { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): '{}'", + dentry.file_name() + ); + continue; + } + }; + + join_set.spawn_blocking(move || { + ( + tenant_shard_id, + load_tenant_config(conf, tenant_shard_id, dentry), + ) + }); } while let Some(r) = join_set.join_next().await { - if let Some((tenant_id, tenant_config)) = r?? 
{ - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs } #[derive(Debug, thiserror::Error)] @@ -472,7 +468,7 @@ pub async fn init_tenant_mgr( ); // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; @@ -590,31 +586,23 @@ pub async fn init_tenant_mgr( ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { - // Errors writing configs are fatal - config_write_result?; + // Writing a config to local disk is foundational to startup up tenants: panic if we can't. + config_write_result.fatal_err("writing tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => { - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => TenantSlot::Attached(tenant), - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - continue; - } - } - } + LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + )), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -649,8 +637,7 @@ pub async fn init_tenant_mgr( }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. +/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, @@ -662,23 +649,18 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); +) -> Arc { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); - let tenant = Tenant::spawn( + Tenant::spawn( conf, tenant_shard_id, resources, @@ -687,9 +669,7 @@ fn tenant_spawn( init_order, mode, ctx, - ); - - Ok(tenant) + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { @@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -971,7 +952,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("writing tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -1001,7 +983,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("writing tenant shard config"); return Ok(None); } @@ -1067,7 +1050,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1086,12 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("creating timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("writing tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1110,13 +1095,15 @@ impl TenantManager { // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? 
}; let tenant = tenant_spawn( @@ -1129,7 +1116,7 @@ impl TenantManager { None, spawn_mode, ctx, - )?; + ); TenantSlot::Attached(tenant) } @@ -1143,7 +1130,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1250,7 +1237,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1984,7 +1971,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 93e9ad3673..3705406c2f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -41,18 +41,35 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() + + # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): + + # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path + neon_simple_env.storage_controller.allowed_errors.extend( + [ + ".*Reconcile not done yet while creating tenant.*", + ".*Reconcile error: receive body: error sending request.*", + ".*Error processing HTTP request: InternalServerError.*", + ] + ) + + with pytest.raises(Exception, match="error sending request"): _ = neon_simple_env.neon_cli.create_tenant() + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it. + neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. 
- pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() From daaa3211a473cab5b8a1fa4fe6bb27fd754902dc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 2 Jul 2024 15:13:27 +0200 Subject: [PATCH 149/412] fix: noisy logging when download gets cancelled during shutdown (#8224) Before this PR, during timeline shutdown, we'd occasionally see log lines like this one: ``` 2024-06-26T18:28:11.063402Z INFO initial_size_calculation{tenant_id=$TENANT,shard_id=0000 timeline_id=$TIMELINE}:logical_size_calculation_task:get_or_maybe_download{layer=000000000000000000000000000000000000-000000067F0001A3950001C1630100000000__0000000D88265898}: layer file download failed, and caller has been cancelled: Cancelled, shutting down Stack backtrace: 0: as core::ops::try_trait::FromResidual>>::from_residual at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/result.rs:1964:27 pageserver::tenant::remote_timeline_client::RemoteTimelineClient::download_layer_file::{{closure}} at /home/nonroot/pageserver/src/tenant/remote_timeline_client.rs:531:13 pageserver::tenant::storage_layer::layer::LayerInner::download_and_init::{{closure}} at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1136:14 pageserver::tenant::storage_layer::layer::LayerInner::download_init_and_wait::{{closure}}::{{closure}} at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1082:74 ``` We can eliminate the anyhow backtrace with no loss of information because the conversion to anyhow::Error happens in exactly one place. refs #7427 --- pageserver/src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa..bc9364de61 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -519,7 +519,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5dd9472535..02069c29d2 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1096,19 +1096,10 @@ impl LayerInner { match rx.await { Ok(Ok(res)) => Ok(res), - Ok(Err(e)) => { - // sleep already happened in the spawned task, if it was not cancelled - match e.downcast_ref::() { - // If the download failed due to its cancellation token, - // propagate the cancellation error upstream. - Some(remote_storage::DownloadError::Cancelled) => { - Err(DownloadError::DownloadCancelled) - } - // FIXME: this is not embedding the error because historically it would had - // been output to compute, however that is no longer the case. 
- _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1118,7 +1109,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file( From c3e5223a5d00ff22c90013c516b5fbd602c0d800 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 2 Jul 2024 14:14:10 +0100 Subject: [PATCH 150/412] pageserver: rate limit log for loads of layers visited (#8228) ## Problem At high percentiles we see more than 800 layers being visited by the read path. We need the tenant/timeline to investigate. ## Summary of changes Add a rate limited log line when the average number of layers visited per key is in the last specified histogram bucket. I plan to use this to identify tenants in us-east-2 staging that exhibit this behaviour. Will revert before next week's release. --- libs/pageserver_api/src/keyspace.rs | 10 ++++++++++ pageserver/src/tenant/timeline.rs | 22 +++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 9a61f2ad81..401887d362 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,16 @@ pub struct KeySpace { pub ranges: Vec>, } +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + /// A wrapper type for sparse keyspaces. #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SparseKeySpace(pub KeySpace); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8dd0a23f46..ec94ed3a56 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -996,6 +996,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -1228,7 +1229,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1258,11 +1259,26 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + tenant_id = %self.tenant_shard_id.tenant_id, + shard_id = %self.tenant_shard_id.shard_slug(), + timeline_id = %self.timeline_id, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + // Note that this is an approximation. 
Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); } Ok(results) From aeb68e51dff72266603b4a1db01fe1d055924f04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 2 Jul 2024 16:14:12 +0200 Subject: [PATCH 151/412] Add support for reading and writing compressed blobs (#8106) Add support for reading and writing zstd-compressed blobs for use in image layer generation, but maybe one day useful also for delta layers. The reading of them is unconditional while the writing is controlled by the `image_compression` config variable allowing for experiments. For the on-disk format, we re-use some of the bitpatterns we currently keep reserved for blobs larger than 256 MiB. This assumes that we have never ever written any such large blobs to image layers. After the preparation in #7852, we now are unable to read blobs with a size larger than 256 MiB (or write them). A non-goal of this PR is to come up with good heuristics of when to compress a bitpattern. This is left for future work. Parts of the PR were inspired by #7091. cc #7879 Part of #5431 --- libs/pageserver_api/src/models.rs | 18 ++ pageserver/src/config.rs | 21 ++- pageserver/src/tenant/blob_io.rs | 155 +++++++++++++++--- .../src/tenant/storage_layer/delta_layer.rs | 7 +- 4 files changed, 177 insertions(+), 24 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 61a255cdbc..959e161c16 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -432,6 +432,24 @@ pub enum CompactionAlgorithm { Tiered, } +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Serialize, + Deserialize, + strum_macros::FromRepr, + strum_macros::EnumString, +)] +#[strum(serialize_all = "kebab-case")] +pub enum ImageCompressionAlgorithm { + /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). + Zstd { level: Option }, +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2b698b75dc..470e941c33 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,7 +5,7 @@ //! See also `settings.md` for better description on every parameter. 
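For readers wondering how the new knob is spelled: `ImageCompressionAlgorithm` above derives `EnumString` with `serialize_all = "kebab-case"`, and strum fills struct-variant fields with their defaults, so — assuming `parse_toml_from_str` simply runs `FromStr` on a TOML string value, which is not shown in this hunk — a plausible `pageserver.toml` line would be:

```toml
# Hypothetical example: enable zstd image compression at its default level.
# (A specific `level` does not appear to be expressible through this string form.)
image_compression = 'zstd'
```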
use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; @@ -50,6 +50,7 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; @@ -90,6 +91,8 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + pub const DEFAULT_IMAGE_COMPRESSION: Option = None; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -285,6 +288,8 @@ pub struct PageServerConf { pub validate_vectored_get: bool, + pub image_compression: Option, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. @@ -395,6 +400,8 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, + image_compression: BuilderValue>, + ephemeral_bytes_per_memory_kb: BuilderValue, } @@ -482,6 +489,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), } @@ -667,6 +675,10 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_image_compression(&mut self, value: Option) { + self.image_compression = BuilderValue::Set(value); + } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } @@ -727,6 +739,7 @@ impl PageServerConfigBuilder { get_impl, max_vectored_read_bytes, validate_vectored_get, + image_compression, ephemeral_bytes_per_memory_kb, } CUSTOM LOGIC @@ -1004,6 +1017,9 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "image_compression" => { + builder.get_image_compression(Some(parse_toml_from_str("image_compression", item)?)) + } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? 
as usize) } @@ -1088,6 +1104,7 @@ impl PageServerConf { NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, } @@ -1328,6 +1345,7 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, }, "Correct defaults should be used when no config values are provided" @@ -1401,6 +1419,7 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, }, "Should be able to parse all basic config values correctly" diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 2be8816cef..022801b17f 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -6,12 +6,18 @@ //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header -//! by peeking at the first byte. +//! by peeking at the first byte. For blobs larger than 128 bits, +//! we also specify three reserved bits, only one of the three bit +//! patterns is currently in use (0b011) and signifies compression +//! with zstd. //! //! len < 128: 0XXXXXXX -//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
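To make the `1CCCXXXX` layout described above concrete, here is a small self-contained sketch using the constants this patch introduces further down in the file (`LEN_COMPRESSION_BIT_MASK`, `BYTE_UNCOMPRESSED`, `BYTE_ZSTD`); it is illustrative only and not part of the diff:

```rust
// Mirrors the 4-byte header scheme from the module docs above.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

fn main() {
    // Encode: a blob whose on-disk (zstd-compressed) payload is 300_000 bytes.
    let len: u32 = 300_000;
    let mut hdr = len.to_be_bytes(); // [0x00, 0x04, 0x93, 0xE0]
    hdr[0] |= BYTE_ZSTD;             // [0x90, 0x04, 0x93, 0xE0]

    // Decode: the high nibble holds the long-header flag plus the compression
    // bits, the remaining 28 bits hold the payload length.
    let compression_bits = hdr[0] & LEN_COMPRESSION_BIT_MASK;
    assert_eq!(compression_bits, BYTE_ZSTD);
    assert_ne!(compression_bits, BYTE_UNCOMPRESSED);
    hdr[0] &= !LEN_COMPRESSION_BIT_MASK;
    assert_eq!(u32::from_be_bytes(hdr), 300_000);
}
```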
+use async_compression::Level; use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use crate::context::RequestContext; @@ -66,12 +72,29 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED { + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -85,14 +108,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; + +const BYTE_UNCOMPRESSED: u8 = 0x80; +const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -219,6 +263,17 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, + ) -> (B::Buf, Result) { + self.write_blob_maybe_compressed(srcbuf, ctx, None).await + } + + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ctx: &RequestContext, + algorithm: Option, ) -> (B::Buf, Result) { let offset = self.offset; @@ -226,29 +281,58 @@ impl BlobWriter { let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); - let (io_buf, hdr_res) = async { + let mut compressed_buf = None; + let ((io_buf, hdr_res), srcbuf) = async { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf, ctx).await + ( + self.write_all(io_buf, ctx).await, + srcbuf.slice_full().into_inner(), + ) } else { // Write a 4-byte length header - if len > 0x7fff_ffff { + if len > MAX_SUPPORTED_LEN { return ( - io_buf, - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf.slice_full().into_inner(), ); } - if len > 0x0fff_ffff { - tracing::warn!("writing blob above future limit ({len} bytes)"); - } - let mut len_buf = (len as u32).to_be_bytes(); - len_buf[0] |= 0x80; + let (high_bit_mask, len_written, srcbuf) = match algorithm { + Some(ImageCompressionAlgorithm::Zstd { level }) => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + let slice = srcbuf.slice_full(); + encoder.write_all(&slice[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + if compressed.len() < len { + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, slice.into_inner()) + } else { + (BYTE_UNCOMPRESSED, len, slice.into_inner()) + } + } + None => (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()), + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf, ctx).await, srcbuf) } } .await; @@ -257,7 +341,12 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf, ctx).await; + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf, ctx).await; + (Slice::into_inner(srcbuf.slice(..)), res) + } else { + self.write_all(srcbuf, ctx).await + }; (srcbuf, res.map(|_| offset)) } } @@ -295,6 +384,12 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs).await + } + + async fn round_trip_test_compressed( + blobs: &[Vec], + ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); @@ -305,7 +400,18 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let (_, res) = match COMPRESSION { + 0 => wtr.write_blob(blob.clone(), &ctx).await, + 1 => { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), + ) + .await + } + _ => unreachable!("Invalid compression {COMPRESSION}"), + }; let offs = res?; offsets.push(offs); } @@ -361,10 +467,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; Ok(()) } diff 
--git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c2d4a2776b..e6a4d6d5c4 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -452,7 +452,12 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val, ctx).await; + // We don't want to use compression in delta layer creation + let compression = None; + let (val, res) = self + .blob_writer + .write_blob_maybe_compressed(val, ctx, compression) + .await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), From 1ae6aa09dda396798772fd1944a8bf6a9dc93709 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 2 Jul 2024 16:29:09 +0200 Subject: [PATCH 152/412] L0 flush: opt-in mechanism to bypass PageCache reads and writes (#8190) part of https://github.com/neondatabase/neon/issues/7418 # Motivation (reproducing #7418) When we do an `InMemoryLayer::write_to_disk`, there is a tremendous amount of random read I/O, as deltas from the ephemeral file (written in LSN order) are written out to the delta layer in key order. In benchmarks (https://github.com/neondatabase/neon/pull/7409) we can see that this delta layer writing phase is substantially more expensive than the initial ingest of data, and that within the delta layer write a significant amount of the CPU time is spent traversing the page cache. # High-Level Changes Add a new mode for L0 flush that works as follows: * Read the full ephemeral file into memory -- layers are much smaller than total memory, so this is afforable * Do all the random reads directly from this in memory buffer instead of using blob IO/page cache/disk reads. * Add a semaphore to limit how many timelines may concurrently do this (limit peak memory). * Make the semaphore configurable via PS config. # Implementation Details The new `BlobReaderRef::Slice` is a temporary hack until we can ditch `blob_io` for `InMemoryLayer` => Plan for this is laid out in https://github.com/neondatabase/neon/issues/8183 # Correctness The correctness of this change is quite obvious to me: we do what we did before (`blob_io`) but read from memory instead of going to disk. The highest bug potential is in doing owned-buffers IO. I refactored the API a bit in preliminary PR https://github.com/neondatabase/neon/pull/8186 to make it less error-prone, but still, careful review is requested. # Performance I manually measured single-client ingest performance from `pgbench -i ...`. Full report: https://neondatabase.notion.site/2024-06-28-benchmarking-l0-flush-performance-e98cff3807f94cb38f2054d8c818fe84?pvs=4 tl;dr: * no speed improvements during ingest, but * significantly lower pressure on PS PageCache (eviction rate drops to 1/3) * (that's why I'm working on this) * noticable but modestly lower CPU time This is good enough for merging this PR because the changes require opt-in. We'll do more testing in staging & pre-prod. # Stability / Monitoring **memory consumption**: there's no _hard_ limit on max `InMemoryLayer` size (aka "checkpoint distance") , hence there's no hard limit on the memory allocation we do for flushing. 
In practice, we a) [log a warning](https://github.com/neondatabase/neon/blob/23827c6b0d400cbb9a972d4d05d49834816c40d1/pageserver/src/tenant/timeline.rs#L5741-L5743) when we flush oversized layers, so we'd know which tenant is to blame and b) if we were to put a hard limit in place, we would have to decide what to do if there is an InMemoryLayer that exceeds the limit. It seems like a better option to guarantee a max size for frozen layer, dependent on `checkpoint_distance`. Then limit concurrency based on that. **metrics**: we do have the [flush_time_histo](https://github.com/neondatabase/neon/blob/23827c6b0d400cbb9a972d4d05d49834816c40d1/pageserver/src/tenant/timeline.rs#L3725-L3726), but that includes the wait time for the semaphore. We could add a separate metric for the time spent after acquiring the semaphore, so one can infer the wait time. Seems unnecessary at this point, though. --- pageserver/src/bin/pageserver.rs | 5 + pageserver/src/config.rs | 18 ++- pageserver/src/l0_flush.rs | 46 ++++++ pageserver/src/lib.rs | 1 + pageserver/src/tenant.rs | 13 ++ pageserver/src/tenant/block_io.rs | 22 +++ pageserver/src/tenant/ephemeral_file.rs | 8 +- .../src/tenant/ephemeral_file/page_caching.rs | 146 +++++++++++++----- .../ephemeral_file/zero_padded_read_write.rs | 15 ++ .../tenant/storage_layer/inmemory_layer.rs | 94 ++++++++--- pageserver/src/tenant/timeline.rs | 10 +- pageserver/src/tenant/timeline/delete.rs | 1 + 12 files changed, 322 insertions(+), 57 deletions(-) create mode 100644 pageserver/src/l0_flush.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bd..39d4e46c96 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -421,6 +421,10 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -429,6 +433,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 470e941c33..fa7f7d8d97 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,11 +30,11 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -296,6 +296,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -403,6 +405,8 @@ struct PageServerConfigBuilder { image_compression: BuilderValue>, ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, } impl PageServerConfigBuilder { @@ -492,6 +496,7 @@ impl PageServerConfigBuilder { image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::default()), } } } @@ -683,6 +688,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -741,6 +750,7 @@ impl PageServerConfigBuilder { validate_vectored_get, image_compression, ephemeral_bytes_per_memory_kb, + l0_flush, } CUSTOM LOGIC { @@ -1023,6 +1033,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "l0_flush" => { + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1107,6 +1120,7 @@ impl PageServerConf { image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), } } } @@ -1347,6 +1361,7 @@ background_task_maximum_delay = '334 s' validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1421,6 +1436,7 @@ background_task_maximum_delay = '334 s' validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 0000000000..7fe8fedc63 --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,46 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +use crate::tenant::ephemeral_file; + +#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[default] + PageCached, + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + +#[derive(Clone)] +pub struct L0FlushGlobalState(Arc); + +pub(crate) enum Inner { + PageCached, + Direct { semaphore: tokio::sync::Semaphore }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), + L0FlushConfig::Direct { max_concurrency } => { + let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); + Self(Arc::new(Inner::Direct { semaphore })) + } + } + } + + pub(crate) fn inner(&self) -> &Arc { + &self.0 
+ } +} + +impl L0FlushConfig { + pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { + use L0FlushConfig::*; + match self { + PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No, + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 353f97264c..ac6b9b4f2a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 116481a1eb..89bf89471c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, @@ -166,6 +167,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -294,6 +296,8 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -676,6 +680,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -690,6 +695,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -989,6 +995,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -2478,6 +2485,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { debug_assert!( !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() @@ -2565,6 +2573,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + l0_flush_global_state, } } @@ -3302,6 +3311,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -3638,6 +3648,7 @@ pub(crate) mod harness { use utils::logging; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; use crate::{repository::Key, walrecord::NeonWalRecord}; @@ -3827,6 +3838,8 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant 
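Since the new `l0_flush` setting is wired through serde with `tag = "mode"`, kebab-case variant names and a snake_case `max_concurrency` field (see `l0_flush.rs` above), a plausible opt-in stanza for `pageserver.toml` would look like the following; the value is hypothetical and the default remains the page-cached path:

```toml
# Hypothetical example: bypass the page cache when flushing L0 layers,
# allowing at most 8 timelines to do so concurrently.
l0_flush = { mode = "direct", max_concurrency = 8 }
```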
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index b406d50332..85f3b1c799 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if end > slice.len() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(page_sized)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf153..bb65ae24fc 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,6 +21,7 @@ pub struct EphemeralFile { } mod page_caching; +pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -53,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), }) } @@ -65,6 +66,11 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_to_vec`]. 
+ pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_to_vec(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac87064..43b9fff28d 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::io::{self, ErrorKind}; +use std::ops::{Deref, Range}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -19,14 +20,23 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], +/// should we pre-warm the [`crate::page_cache`] with the contents? +#[derive(Clone, Copy)] +pub enum PrewarmOnWrite { + Yes, + No, +} + impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( page_cache_file_id, file, + prewarm_on_write, )), } } @@ -49,6 +59,43 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. + /// + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. + pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io + let size = { + let s = usize::try_from(self.bytes_written()).unwrap(); + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } + }; + let vec = Vec::with_capacity(size); + + // read from disk what we've already flushed + let writer = self.rw.as_writer(); + let flushed_range = writer.written_range(); + let mut vec = writer + .file + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), + ctx, + ) + .await? + .into_inner(); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.get_tail_zero_padded(); + vec.extend_from_slice(buffered); + assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -116,19 +163,40 @@ impl Drop for RW { } struct PreWarmingWriter { + prewarm_on_write: PrewarmOnWrite, nwritten_blocks: u32, page_cache_file_id: page_cache::FileId, file: VirtualFile, } impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + fn new( + page_cache_file_id: page_cache::FileId, + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + ) -> Self { Self { + prewarm_on_write, nwritten_blocks: 0, page_cache_file_id, file, } } + + /// Return the byte range within `file` that has been written though `write_all`. + /// + /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. 
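The `&_` capture mentioned in the doc comment just above can be shown with a tiny self-contained sketch; the names are made up, and the patch achieves the same effect with an `impl Deref<Target = Range<usize>> + '_` return type rather than an explicit `PhantomData`. Because the returned wrapper keeps `&self` borrowed, the borrow checker rejects a concurrent `&mut self` call such as another `write_all` while the range is still alive:

```rust
use std::marker::PhantomData;
use std::ops::{Deref, Range};

struct Writer {
    written: usize,
}

/// Wrapper whose lifetime is tied to the `&Writer` it was created from.
struct WrittenRange<'a>(Range<usize>, PhantomData<&'a Writer>);

impl Deref for WrittenRange<'_> {
    type Target = Range<usize>;
    fn deref(&self) -> &Range<usize> {
        &self.0
    }
}

impl Writer {
    fn written_range(&self) -> WrittenRange<'_> {
        WrittenRange(0..self.written, PhantomData)
    }
    fn write_all(&mut self, nbytes: usize) {
        self.written += nbytes;
    }
}

fn main() {
    let mut w = Writer { written: 4096 };
    let flushed = w.written_range();
    assert_eq!(*flushed, 0..4096);
    // w.write_all(4096); // rejected by the borrow checker while `flushed` is alive
    drop(flushed);
    w.write_all(4096);
    assert_eq!(*w.written_range(), 0..8192);
}
```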
+ fn written_range(&self) -> (impl Deref> + '_) { + let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); + struct Wrapper(Range); + impl Deref for Wrapper { + type Target = Range; + fn deref(&self) -> &Range { + &self.0 + } + } + Wrapper(0..nwritten_blocks * PAGE_SZ) + } } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { @@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi assert_eq!(&check_bounds_stuff_works, &*buf); } - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. - unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + + if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = + &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); - } - }, + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } } } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); Ok((buflen, buf.into_inner())) } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c..fe310acab8 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,21 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. + pub fn get_tail_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = buffer.pending(); + // pad to next page boundary + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; + &buffer.as_zero_padded_slice()[0..read_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6624fb7e6b..e1eaea90af 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,13 +6,14 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; +use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -410,6 +411,7 @@ impl InMemoryLayer { continue; } + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -620,6 +622,13 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); + use l0_flush::Inner; + let _concurrency_permit = match &*l0_flush_global_state { + Inner::PageCached => None, + Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { @@ -645,28 +654,77 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); + match &*l0_flush_global_state { + l0_flush::Inner::PageCached => { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); - let cursor = inner.file.block_cursor(); + let mut buf = Vec::new(); - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + let cursor = inner.file.block_cursor(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } + } + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); + + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); + + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. the use `Bytes::slice` to get the `buf` that is our blob + // 4. pass that `buf` into `put_value_bytes` + // => https://github.com/neondatabase/neon/issues/8183 + cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, ctx) + .await; + res?; + } + } + + // Hold the permit until the IO is done; if we didn't, one could drop this future, + // thereby releasing the permit, but the Vec remains allocated until the IO completes. + // => we'd have more concurrenct Vec than allowed as per the semaphore. 
+ drop(_concurrency_permit); } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec94ed3a56..de9361d721 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -65,7 +65,6 @@ use std::{ ops::{Deref, Range}, }; -use crate::metrics::GetKind; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, @@ -90,6 +89,10 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; @@ -208,6 +211,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -433,6 +437,8 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, } pub struct WalReceiverInfo { @@ -2392,6 +2398,8 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d747d424d..b0088f4ea2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,6 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. From 64ccdf65e09c3bb15d1202659999d94a192577cd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 2 Jul 2024 16:21:23 +0100 Subject: [PATCH 153/412] CI(benchmarking): move psql queries to actions/run-python-test-set (#8230) ## Problem Some of the Nightly benchmarks fail with the error ``` + /tmp/neon/pg_install/v14/bin/pgbench --version /tmp/neon/pg_install/v14/bin/pgbench: error while loading shared libraries: libpq.so.5: cannot open shared object file: No such file or directory ``` Originally, we added the `pgbench --version` call to check that `pgbench` is installed and to fail earlier if it's not. The failure happens because we don't have `LD_LIBRARY_PATH` set for every job, and it also affects `psql` command. We can move it to `actions/run-python-test-set` so as not to duplicate code (as it already have `LD_LIBRARY_PATH` set). 
## Summary of changes - Remove `pgbench --version` call - Move `psql` commands to common `actions/run-python-test-set` --- .../actions/run-python-test-set/action.yml | 10 ++- .github/workflows/benchmarking.yml | 83 +------------------ 2 files changed, 12 insertions(+), 81 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index a2aae0772b..7f843de1a5 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -179,7 +179,15 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + QUERIES=("SELECT version()") + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + fi + + for q in "${QUERIES[@]}"; do + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" + done fi # Run the tests. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0e748adeb6..db04b5de7d 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -239,11 +239,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Create Neon Project if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project @@ -282,16 +277,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -377,29 +362,12 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - LD_LIBRARY_PATH="${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV - - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} - - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -421,12 +389,12 @@ jobs: test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 + extra_params: -m remote_cluster --timeout 21600 env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -481,11 
+449,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -507,16 +470,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -584,11 +537,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in @@ -617,16 +565,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -685,11 +623,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -711,16 +644,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run user examples uses: ./.github/actions/run-python-test-set with: From 13a8a5b09b6279c4354ec8cc8704b65199fd9256 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 2 Jul 2024 17:17:22 +0100 Subject: [PATCH 154/412] tense of errors (#8234) I forgot a commit when merging https://github.com/neondatabase/neon/pull/8177 --- pageserver/src/tenant/mgr.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c1da1d2c55..b0159e22bf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -358,7 +358,7 @@ fn load_tenant_config( info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - std::fs::remove_dir_all(&tenant_dir_path).fatal_err("Deleting temporary tenant dir"); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); return None; } @@ -368,7 +368,7 @@ fn load_tenant_config( .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - std::fs::remove_dir(&tenant_dir_path).fatal_err("Deleting empty tenant dir"); + std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); return None; } @@ -386,7 +386,7 @@ async fn init_load_tenant_configs( let tenants_dir = conf.tenants_path(); let dentries = tokio::task::spawn_blocking(move || -> Vec { - let context = format!("Reading tenants dir {tenants_dir}"); + let context = format!("read 
tenants dir {tenants_dir}"); let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); dir_entries @@ -587,7 +587,7 @@ pub async fn init_tenant_mgr( // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { // Writing a config to local disk is foundational to startup up tenants: panic if we can't. - config_write_result.fatal_err("writing tenant shard config file"); + config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; @@ -953,7 +953,7 @@ impl TenantManager { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await - .fatal_err("writing tenant shard config"); + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -984,7 +984,7 @@ impl TenantManager { Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await - .fatal_err("writing tenant shard config"); + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1069,14 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .fatal_err("creating timelines/ dir"); + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await - .fatal_err("writing tenant shard config"); + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { From 320b24eab321c49e757bab36d57dd51b7dc7203d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 2 Jul 2024 12:54:32 -0400 Subject: [PATCH 155/412] fix(pageserver): comments about metadata key range (#8236) Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index cd430bfab7..0acd83753e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key. 
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; From 2863d1df63b1e3f9895b2114475a080f84729db1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 2 Jul 2024 21:45:42 +0300 Subject: [PATCH 156/412] Add test for proper handling of connection failure to avoid 'cannot wait on socket event without a socket' error (#8231) ## Problem See https://github.com/neondatabase/cloud/issues/14289 and PR #8210 ## Summary of changes Add test for problems fixed in #8210 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 5 ---- .../regress/test_pageserver_reconnect.py | 24 +++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a3fdcc537e..73a001b6ba 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -427,11 +427,6 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); - if (!shard->conn) - { - neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); - return false; - } if (PQstatus(shard->conn) == CONNECTION_BAD) { char *msg = pchomp(PQerrorMessage(shard->conn)); diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262..37ff923632 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import threading import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): c.execute("select pg_reload_conf()") thread.join() + + +# Test handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") From ed3b4a58b4041a4be100cebb8c33e14f3f35e816 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 3 Jul 2024 04:48:56 -0400 Subject: [PATCH 157/412] docker: add storage_scrubber into the docker image (#8239) ## Problem We will run this tool in the k8s cluster. To make it accessible from k8s, we need to package it into the docker image. 
part of https://github.com/neondatabase/cloud/issues/14024 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index f0197758e4..a41598ef72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,7 @@ RUN set -e \ --bin storage_controller \ --bin proxy \ --bin neon_local \ + --bin storage_scrubber \ --locked --release \ && cachepot -s @@ -83,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ From db70c175e6151c36a17a3e6c10a0011673856dc6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 3 Jul 2024 13:22:53 +0300 Subject: [PATCH 158/412] Simplify test_wal_page_boundary_start test (#8214) All the code to ensure the WAL record lands at a page boundary was unnecessary for reproducing the original problem. In fact, it's a pretty basic test that checks that outbound replication (= neon as publisher) still works after restarting the endpoint. It just used to be very broken before commit 5ceccdc7de, which also added this test. To verify that: 1. Check out commit f3af5f4660 (because the next commit, 7dd58e1449, fixed the same bug in a different way, making it infeasible to revert the bug fix in an easy way) 2. Revert the bug fix from commit 5ceccdc7de with this: ``` diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7debb6325..9f03bbd99 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1437,8 +1437,10 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) * * https://github.com/neondatabase/neon/issues/5749 */ +#if 0 if (!wp->config->syncSafekeepers) XLogUpdateWalBuffers(buf, recptr, nbytes); +#endif while (nbytes > 0) { ``` 3. Run the test_wal_page_boundary_start regression test. It fails, as expected 4. Apply this commit to the test, and run it again. It still fails, with the same error mentioned in issue #5749: ``` PG:2024-06-30 20:49:08.805 GMT [1248196] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.567 GMT [1467972] LOG: starting logical decoding for slot "sub1" PG:2024-06-30 21:37:52.567 GMT [1467972] DETAIL: Streaming transactions committing after 0/1532330, reading WAL from 0/1531C78. PG:2024-06-30 21:37:52.567 GMT [1467972] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.567 GMT [1467972] LOG: logical decoding found consistent point at 0/1531C78 PG:2024-06-30 21:37:52.567 GMT [1467972] DETAIL: There are no running transactions. 
PG:2024-06-30 21:37:52.567 GMT [1467972] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.568 GMT [1467972] ERROR: could not find record while sending logically-decoded data: invalid contrecord length 312 (expected 6) at 0/1533FD8 ``` --- .../regress/test_logical_replication.py | 60 +++---------------- 1 file changed, 9 insertions(+), 51 deletions(-) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ca3c81d6e5..41283e4d2c 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,7 +4,6 @@ from random import choice from string import ascii_lowercase import pytest -from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -13,7 +12,7 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import wait_until def random_string(n: int): @@ -326,12 +325,17 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of assert "could not receive data from WAL stream" not in logs -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. @pytest.mark.parametrize( "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] ) -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -356,52 +360,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. - for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. 
- payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. - if not (int(supposedly_contrecord_end) % 8192 == 32): - pytest.skip("missed page boundary, bad luck") - - cur.execute("insert into replication_example values (2, 3)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop().start() From 252c4acec9e649b304ab0b1ff7dd89fbfe4852a5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 3 Jul 2024 12:19:13 +0100 Subject: [PATCH 159/412] CI: update docker/* actions to latest versions (#7694) ## Problem GitHub Actions complain that we use actions that depend on deprecated Node 16: ``` Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: docker/setup-buildx-action@v2 ``` But also, the latest `docker/setup-buildx-action` fails with the following error: ``` /nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175 throw new Error(`Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved.`); ^ Error: Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved. 
at Object.rejected (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175:1) at Generator.next () at fulfilled (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:29:1) ``` We can work this around by setting `cache-binary: false` for `uses: docker/setup-buildx-action@v3` ## Summary of changes - Update `docker/setup-buildx-action` from `v2` to `v3`, set `cache-binary: false` - Update `docker/login-action` from `v2` to `v3` - Update `docker/build-push-action` from `v4`/`v5` to `v6` --- .github/workflows/build-build-tools-image.yml | 8 +++++--- .github/workflows/build_and_test.yml | 17 ++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index f1c39e7e4f..a69686bf2a 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -63,14 +63,16 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - - uses: docker/login-action@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v4 + - uses: docker/build-push-action@v6 with: context: . provenance: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 24ad26205b..5ac8c6ec27 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -751,14 +751,16 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: context: . build-args: | @@ -829,11 +831,12 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 with: + cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -849,7 +852,7 @@ jobs: password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Build compute-node image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -868,7 +871,7 @@ jobs: - name: Build neon extensions test image if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -889,7 +892,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: target: compute-tools-image context: . 
From c500137ca98fa78a911fd085494d586e405db1e3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jul 2024 14:13:06 +0100 Subject: [PATCH 160/412] pageserver: don't try to flush if shutdown during attach (#8235) ## Problem test_location_conf_churn fails on log errors when it tries to shutdown a pageserver immediately after starting a tenant attach, like this: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8224/9761000525/index.html#/testresult/15fb6beca5c7327c ``` shutdown:shutdown{tenant_id=35f5c55eb34e7e5e12288c5d8ab8b909 shard_id=0000}:timeline_shutdown{timeline_id=30936747043353a98661735ad09cbbfe shutdown_mode=FreezeAndFlush}: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited\n') ``` This is happening because Tenant::shutdown fires its cancellation token early if the tenant is not fully attached by the time shutdown is called, so the flush loop is shutdown by the time we try and flush. ## Summary of changes - In the early-cancellation case, also set the shutdown mode to Hard to skip trying to do a flush that will fail. --- pageserver/src/tenant.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 89bf89471c..0c911939e8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1816,9 +1816,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. - if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} From 4fb50144dd8ffbe43887c103e4c8213a12678ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 3 Jul 2024 18:02:10 +0200 Subject: [PATCH 161/412] Only support compressed reads if the compression setting is present (#8238) PR #8106 was created with the assumption that no blob is larger than `256 MiB`. Due to #7852 we have checking for *writes* of blobs larger than that limit, but we didn't have checking for *reads* of such large blobs: in theory, we could be reading these blobs every day but we just don't happen to write the blobs for some reason. Therefore, we now add a warning for *reads* of such large blobs as well. To make deploying compression less dangerous, we therefore only assume a blob is compressed if the compression setting is present in the config. This also means that we can't back out of compression once we enabled it. 
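To make the read-side gating concrete, the decision can be sketched roughly as follows. The constant values and the helper name below are illustrative assumptions rather than the exact `blob_io.rs` definitions (the real change is in the diff below); the idea is simply that the high bits of the 4-byte big-endian length prefix are interpreted as a compression marker only when image compression is enabled in the config, and are otherwise masked off as plain length bits:

```
// Sketch only: the mask and marker values are assumed, not the exact blob_io.rs constants.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const BYTE_ZSTD: u8 = 0x80 | 0x10;

/// Returns (blob length, whether the payload needs zstd decompression).
fn decode_len_prefix(mut len_buf: [u8; 4], read_compressed: bool) -> (usize, bool) {
    let marker = len_buf[0] & LEN_COMPRESSION_BIT_MASK;
    // With compression disabled in the config we keep the legacy interpretation:
    // everything below the top bit is length, never a compression marker.
    len_buf[0] &= if read_compressed {
        !LEN_COMPRESSION_BIT_MASK
    } else {
        0x7f
    };
    let len = u32::from_be_bytes(len_buf) as usize;
    let is_zstd = read_compressed && marker == BYTE_ZSTD;
    (len, is_zstd)
}
```

In the patch this shows up as a `read_compressed` / `compressed_reads` flag threaded through `BlockCursor` and `FileBlockReader`, driven by `conf.image_compression.is_some()`, plus a warning when a read sees the marker bits while compression is disabled.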
Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/tenant/blob_io.rs | 45 +++++++++++-------- pageserver/src/tenant/block_io.rs | 31 +++++++++++-- .../src/tenant/storage_layer/image_layer.rs | 28 ++++++++---- pageserver/src/tenant/storage_layer/layer.rs | 1 + 4 files changed, 75 insertions(+), 30 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 022801b17f..de74066b81 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -19,6 +19,7 @@ use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -72,14 +73,22 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; let mut tmp_buf = Vec::new(); let buf_to_write; - let compression = if compression_bits <= BYTE_UNCOMPRESSED { + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } buf_to_write = dstbuf; None } else if compression_bits == BYTE_ZSTD { @@ -384,10 +393,10 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { - round_trip_test_compressed::(blobs).await + round_trip_test_compressed::(blobs).await } - async fn round_trip_test_compressed( + async fn round_trip_test_compressed( blobs: &[Vec], ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; @@ -400,17 +409,15 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = match COMPRESSION { - 0 => wtr.write_blob(blob.clone(), &ctx).await, - 1 => { - wtr.write_blob_maybe_compressed( - blob.clone(), - &ctx, - Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), - ) - .await - } - _ => unreachable!("Invalid compression {COMPRESSION}"), + let (_, res) = if COMPRESSION { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), + ) + .await + } else { + wtr.write_blob(blob.clone(), &ctx).await }; let offs = res?; offsets.push(offs); @@ -425,7 +432,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, COMPRESSION); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -459,6 +466,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; Ok(()) } @@ -474,8 +483,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; - round_trip_test_compressed::(blobs).await?; - round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; Ok(()) } diff --git 
a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 85f3b1c799..3324e840ec 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -149,16 +149,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -188,11 +196,25 @@ pub struct FileBlockReader<'a> { /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, + + compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - FileBlockReader { file_id, file } + Self::new_with_compression(file, file_id, false) + } + + pub fn new_with_compression( + file: &'a VirtualFile, + file_id: FileId, + compressed_reads: bool, + ) -> Self { + FileBlockReader { + file_id, + file, + compressed_reads, + } } /// Read a page from the underlying file into given buffer. @@ -239,7 +261,10 @@ impl<'a> FileBlockReader<'a> { impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad..4a1b3a0237 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -165,6 +165,7 @@ pub struct ImageLayerInner { file_id: FileId, max_vectored_read_bytes: Option, + compressed_reads: bool, } impl std::fmt::Debug for ImageLayerInner { @@ -178,7 +179,8 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, @@ -266,9 +268,10 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx) + .await + .and_then(|res| res)?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -377,6 +380,7 @@ impl ImageLayerInner { lsn: Lsn, summary: Option, max_vectored_read_bytes: Option, + support_compressed_reads: bool, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path, ctx).await { @@ -420,6 +424,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + compressed_reads: support_compressed_reads, key_range: actual_summary.key_range, })) } @@ -430,7 +435,8 @@ impl ImageLayerInner 
{ reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); @@ -490,12 +496,14 @@ impl ImageLayerInner { &self, ctx: &RequestContext, ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { // TODO: dedup code with get_reconstruct_value @@ -530,7 +538,8 @@ impl ImageLayerInner { .into(), ); - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); @@ -691,7 +700,8 @@ impl ImageLayerInner { #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); ImageLayerIterator { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 02069c29d2..d1f5cc8f43 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1685,6 +1685,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), + owner.conf.image_compression.is_some(), ctx, ) .await From 08b33adfee2f4ab5d4dcbcee03120b427f53996c Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 3 Jul 2024 18:22:33 +0200 Subject: [PATCH 162/412] add pagebench test cases for periodic pagebench on dedicated hardware (#8233) we want to run some specific pagebench test cases on dedicated hardware to get reproducible results run1: 1 client per tenant => characterize throughput with n tenants. - 500 tenants - scale 13 (200 MB database) - 1 hour duration - ca 380 GB layer snapshot files run2.singleclient: 1 client per tenant => characterize latencies run2.manyclient: N clients per tenant => characterize throughput scalability within one tenant. 
- 1 tenant with 1 client for latencies - 1 tenant with 64 clients because typically for a high number of connections we recommend the connection pooler which by default uses 64 connections (for scalability) - scale 136 (2048 MB database) - 20 minutes each --- .github/workflows/periodic_pagebench.yml | 144 ++++++++++++++++++ ...er_max_throughput_getpage_at_latest_lsn.py | 86 ++++++++--- test_runner/performance/pageserver/util.py | 2 +- 3 files changed, 212 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/periodic_pagebench.yml diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 0000000000..c0219599a2 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,144 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 18 * * *' # Runs at 6 PM UTC every day + workflow_dispatch: # Allows manual triggering of the workflow + inputs: + commit_hash: + type: string + description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' + required: false + default: '' + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + trigger_bench_on_ec2_machine_in_eu_central_1: + runs-on: [ self-hosted, gen3, small ] + container: + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 360 # Set the timeout to 6 hours + env: + API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} + RUN_ID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} + AWS_DEFAULT_REGION : "eu-central-1" + AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + steps: + - name: Show my own (github runner) external IP address - usefull for IP allowlisting + run: curl https://ifconfig.me + + - name: Start EC2 instance and wait for the instance to boot up + run: | + aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID + sleep 60 # sleep some time to allow cloudinit and our API server to start up + + - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US + run: | + public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) + echo "Public IP of the EC2 instance: $public_ip" + echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV + + - name: Determine commit hash + env: + INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} + run: | + if [ -z "$INPUT_COMMIT_HASH" ]; then + echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + else + echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + fi + + - name: Start Bench with run_id + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H 
'Content-Type: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + + - name: Poll Test Status + id: poll_step + run: | + status="" + while [[ "$status" != "failure" && "$status" != "success" ]]; do + response=$(curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY") + echo "Response: $response" + set +x + status=$(echo $response | jq -r '.status') + echo "Test status: $status" + if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then + break + fi + if [[ "$status" == "too_many_runs" ]]; then + echo "Too many runs already running" + echo "too_many_runs=true" >> "$GITHUB_OUTPUT" + exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into this job's log + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1d579214b0..a8f48fe675 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -17,30 +18,74 @@ from performance.pageserver.util import ( setup_pageserver_with_tenants, ) +# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver. +# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`` +# so you still see some references to this name in the code. +# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn` +# for some files and metrics. 
+ # For reference, the space usage of the snapshots: -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots -# 137G /instance_store/test_output/shared-snapshots -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* -# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 -# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 -# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 -# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 -# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 -# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 -@pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) -@pytest.mark.parametrize("n_tenants", [1, 10]) -@pytest.mark.timeout( - 10000 -) # TODO: this value is just "a really high number"; have this per instance type -def test_pageserver_max_throughput_getpage_at_latest_lsn( +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13 +@pytest.mark.parametrize("duration", [60 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [500]) +@pytest.mark.timeout(10000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, n_tenants: int, pgbench_scale: int, duration: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1 + ) + + +# For reference, the space usage of the snapshots: +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136 +@pytest.mark.parametrize("duration", [20 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) +# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability +# we use 64 clients because typically for a high number of connections we recommend the connection pooler +# which by default uses 64 connections +@pytest.mark.parametrize("n_clients", [1, 64]) +@pytest.mark.parametrize("n_tenants", [1]) +@pytest.mark.timeout(2400) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -55,6 
+100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -96,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" ) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): @@ -157,8 +203,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): return (template_tenant, template_timeline, config) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. @@ -172,6 +218,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 92e05663ce..88296a7fbd 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance From a11cf0312385d0d9d96da9db1394f29d4827f4ea Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jul 2024 17:27:34 +0100 Subject: [PATCH 163/412] pageserver: reduce ops tracked at per-timeline detail (#8245) ## Problem We record detailed histograms for all page_service op types, which mostly aren't very interesting, but make our prometheus scrapes huge. Closes: #8223 ## Summary of changes - Only track GetPageAtLsn histograms on a per-timeline granularity. For all other operation types, rely on existing node-wide histograms. 
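[Editorial note] To get a feel for why per-timeline SMGR histograms dominate scrape size, here is a rough back-of-the-envelope sketch. The variant count, bucket count, and timeline count below are assumptions for illustration only (they are not taken from this patch); what the patch changes is the multiplier structure — one series per bucket plus `_sum`/`_count`, per tenant/shard/timeline, per op — by keeping only the GetPageAtLsn histogram at per-timeline granularity.

```python
# Illustrative cardinality estimate; smgr_ops, buckets and timelines are assumed values.
smgr_ops = 5        # assumed number of SmgrQueryType variants
buckets = 30        # assumed histogram bucket count
series_per_histogram = buckets + 2   # buckets plus _sum and _count
timelines = 2000    # assumed timelines hosted on one pageserver

before = timelines * smgr_ops * series_per_histogram   # per-op, per-timeline histograms
after = timelines * 1 * series_per_histogram           # only GetPageAtLsn kept per-timeline

print(f"per-timeline smgr series: {before} -> {after} "
      f"({100 * (before - after) / before:.0f}% fewer)")
```
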
--- pageserver/src/metrics.rs | 107 ++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9cd7ffa042..a21d8780cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use tracing::warn; use utils::id::TimelineId; @@ -1076,21 +1076,12 @@ pub(crate) mod virtual_file_io_engine { }); } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, -} - -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} - struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - h: &'a GlobalAndPerTimelineHistogram, + global_metric: &'a Histogram, + + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, + ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, @@ -1121,7 +1112,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.h.observe(ex_throttled.as_secs_f64()); + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -1146,7 +1140,8 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1224,27 +1219,32 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, + } } pub(crate) fn start_timer<'c: 'a, 'a>( &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), @@ -1263,12 +1263,20 @@ impl SmgrQueryTimePerTimeline { }); } } - GlobalAndPerTimelineHistogramTimer { - h: metric, + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, ctx, start, 
op, - } + }) } } @@ -1315,17 +1323,9 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) - .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1336,7 +1336,12 @@ mod smgr_query_time_tests { drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, others aren't + assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -2317,14 +2322,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } From c4423c0623d21b3fdbc117045785a4b6878c9f64 Mon Sep 17 00:00:00 2001 From: Japin Li Date: Thu, 4 Jul 2024 01:55:36 +0800 Subject: [PATCH 164/412] Fix outdated comment (#8149) Commit 97b48c23f changes the log wait timeout from 1 second to 100 milliseconds but forgets to update the comment. --- compute_tools/src/compute.rs | 5 ++--- compute_tools/src/pg_helpers.rs | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a79b666409..41a52ef5b6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -873,9 +873,8 @@ impl ComputeNode { Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index fa0822748b..863fa9468f 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> /// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); From 47e06a2cc6875b754e80442a034f77211dd7f799 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 3 Jul 2024 20:05:01 +0200 Subject: [PATCH 165/412] page_service: stop exposing `get_last_record_rlsn` (#8244) Compute doesn't use it, let's eliminate it. Ref to Slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1719920261995529 --- pageserver/src/metrics.rs | 1 - pageserver/src/page_service.rs | 47 -------------------------------- test_runner/regress/test_auth.py | 2 +- 3 files changed, 1 insertion(+), 49 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a21d8780cf..87ff8f4d64 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1452,7 +1452,6 @@ pub(crate) enum ComputeCommandKind { PageStreamV2, PageStream, Basebackup, - GetLastRecordRlsn, Fullbackup, ImportBasebackup, ImportWal, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6ea5f396d0..a440ad6378 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1656,53 +1656,6 @@ where metric_recording.observe(&res); res?; } - // return pair of prev_lsn and last_lsn - else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" - ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::GetLastRecordRlsn) - .inc(); - - async { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - } - .instrument(info_span!( - "handle_get_last_record_lsn", - shard_id = tracing::field::Empty - )) - .await?; - } // same as basebackup, but result includes relational data as well else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 035ab2796f..922a21a999 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"show {env.initial_tenant}", expect_success, **conn_kwargs, ) From fee4169b6bb0b34791f676d57b49bbd24da02493 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 3 Jul 2024 14:46:58 -0400 Subject: [PATCH 166/412] fix(pageserver): ensure test creates valid layer map (#8191) I'd like to add some constraints to the layer map we generate in tests. (1) is the layer map that the current compaction algorithm will produce. There is a property that for all delta layer, all delta layer overlaps with it on the LSN axis will have the same LSN range. (2) is the layer map that cannot be produced with the legacy compaction algorithm. (3) is the layer map that will be produced by the future tiered-compaction algorithm. The current validator does not allow that but we can modify the algorithm to allow it in the future. ## Summary of changes Add a validator to check if the layer map is valid and refactor the test cases to include delta layer start/end LSN. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 177 ++++++++++++++++-------------- pageserver/src/tenant/timeline.rs | 92 +++++++++++++--- 2 files changed, 172 insertions(+), 97 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0c911939e8..adf492ace7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1365,7 +1365,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -2933,7 +2933,7 @@ impl Tenant { dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -3933,7 +3933,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::GcInfo; + use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -6229,27 +6229,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6325,27 +6304,6 @@ mod tests { 
.await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6421,9 +6379,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![ @@ -6489,17 +6456,29 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], - vec![ - (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), - (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), - ], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], - Lsn(0x30), + Lsn(0x40), ) .await .unwrap(); @@ -6522,7 +6501,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx) .await .unwrap() .into_iter() @@ -6548,9 +6527,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], @@ -6598,15 +6586,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. 
+ // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. // - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6646,13 +6640,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::Image(Bytes::from("value 8@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::Image(Bytes::from("value 9@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), ), ]; @@ -6662,7 +6656,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -6683,8 +6681,8 @@ mod tests { Bytes::from_static(b"value 5@0x20"), Bytes::from_static(b"value 6@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x40"), - Bytes::from_static(b"value 9@0x40"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), ]; for (idx, expected) in expected_result.iter().enumerate() { @@ -6772,10 +6770,10 @@ mod tests { lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, - // The delta layer we created and should not be picked for the compaction + // The delta3 layer that should not be picked for the compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), - lsn_range: Lsn(0x40)..Lsn(0x41), + lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true } ] @@ -6839,7 +6837,10 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1], // delta layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) @@ -6963,15 +6964,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -7021,13 +7028,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; @@ -7037,7 +7044,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -7064,8 +7075,8 @@ mod tests { Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x10@0x40"), - Bytes::from_static(b"value 9@0x10@0x40"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index de9361d721..df4d252ad2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4735,6 +4735,42 @@ impl DurationRecorder { } } +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. 
+#[cfg(test)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, +} + +#[cfg(test)] +impl DeltaLayerTestDesc { + #[allow(dead_code)] + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } + + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } +} + impl Timeline { async fn finish_compact_batch( self: &Arc, @@ -5535,37 +5571,65 @@ impl Timeline { #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, - mut deltas: Vec<(Key, Lsn, Value)>, + mut deltas: DeltaLayerTestDesc, check_start_lsn: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); - deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); - let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); - let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end); + } assert!( - max_lsn <= last_record_lsn, - "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn ); - let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { - assert!(min_lsn >= check_start_lsn); + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). 
+ pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + { + // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic + panic!( + "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", + deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end + ); + } + } } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - min_key, - min_lsn..end_lsn, + deltas.key_range.start, + deltas.lsn_range, ctx, ) .await?; - for (key, lsn, val) in deltas { + for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?; + let delta_layer = delta_layer_writer + .finish(deltas.key_range.end, self, ctx) + .await?; { let mut guard = self.layers.write().await; From 7b7d16f52e0c901303c2d749c0821148dd40f8bb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jul 2024 22:29:43 +0100 Subject: [PATCH 167/412] pageserver: add supplementary branch usage stats (#8131) ## Problem The metrics we have today aren't convenient for planning around the impact of timeline archival on costs. Closes: https://github.com/neondatabase/neon/issues/8108 ## Summary of changes - Add metric `pageserver_archive_size`, which indicates the logical bytes of data which we would expect to write into an archived branch. - Add metric `pageserver_pitr_history_size`, which indicates the distance between last_record_lsn and the PITR cutoff. These metrics are somewhat temporary: when we implement #8088 and associated consumption metric changes, these will reach a final form. For now, an "archived" branch is just any branch outside of its parent's PITR window: later, archival will become an explicit state (which will _usually_ correspond to falling outside the parent's PITR window). The overall volume of timeline metrics is something to watch, but we are removing many more in https://github.com/neondatabase/neon/pull/8245 than this PR is adding. --- libs/pageserver_api/src/models.rs | 10 +++++++++ pageserver/src/http/routes.rs | 4 ++++ pageserver/src/metrics.rs | 35 +++++++++++++++++++++++++++++++ pageserver/src/tenant.rs | 27 ++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 15 +++++++++++++ test_runner/fixtures/metrics.py | 2 ++ 6 files changed, 93 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 959e161c16..9228953761 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -661,6 +661,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). 
If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. + pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f726ba115d..6a6f17604d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -406,6 +406,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -426,6 +428,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 87ff8f4d64..9e9fe7fbb8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -2106,6 +2124,8 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2179,6 +2199,15 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2231,6 +2260,8 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2288,6 +2319,10 @@ impl TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = 
EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index adf492ace7..eef8dc104c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2874,6 +2874,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); @@ -2882,6 +2883,31 @@ impl Tenant { .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0, + ); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; @@ -7063,6 +7089,7 @@ mod tests { horizon: Lsn(0x30), }, leases: Default::default(), + within_ancestor_pitr: false, }; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index df4d252ad2..54bbdef56e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -463,6 +463,9 @@ pub(crate) struct GcInfo { /// Leases granted to particular LSNs. pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { @@ -851,6 +854,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. + pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 41fa8e679f..c019cbbc77 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -144,6 +144,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", From b917868ada53f1e3a486a0cf3f2552c88a07547d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Jul 2024 06:04:19 +0100 Subject: [PATCH 168/412] tests: perform graceful rolling restarts in storcon scale test (#8173) ## Problem Scale test doesn't exercise drain & fill. 
## Summary of changes Make scale test exercise drain & fill --- test_runner/fixtures/neon_fixtures.py | 47 +++++++ .../test_storage_controller_scale.py | 124 ++++++++++++++++-- .../regress/test_storage_controller.py | 59 ++------- 3 files changed, 171 insertions(+), 59 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 565aaba6e0..c002e11c1c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2113,6 +2113,21 @@ class NeonStorageController(MetricsGetter, LogUtils): self.running = False return self + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + @staticmethod def raise_api_exception(res: requests.Response): try: @@ -2453,6 +2468,38 @@ class NeonStorageController(MetricsGetter, LogUtils): ) log.info("storage controller passed consistency check") + def poll_node_status( + self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + ): + """ + Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + """ + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = self.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index a4c8c8ac42..d65a66b010 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -1,18 +1,89 @@ import concurrent.futures import random import time +from collections import defaultdict +from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion +def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: + """ + Get the number of shards attached to each node. + This function takes into account the intersection of the intent and the observed state. + If they do not match, it asserts out. 
+ """ + tenants = env.storage_controller.tenant_list() + + intent = dict() + observed = dict() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + observed[t["tenant_shard_id"]] = int(node_id) + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) + + if "attached" in t["intent"]: + intent[t["tenant_shard_id"]] = t["intent"]["attached"] + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + log.info(f"{tenant_placement=}") + + matching = { + tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + @pytest.mark.timeout(3600) # super long running test: should go down as we optimize def test_storage_controller_many_tenants( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure @@ -44,7 +115,8 @@ def test_storage_controller_many_tenants( # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + neon_env_builder.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. 
@@ -79,6 +151,8 @@ def test_storage_controller_many_tenants( shard_count = 2 stripe_size = 1024 + total_shards = tenant_count * shard_count + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -195,10 +269,44 @@ def test_storage_controller_many_tenants( env.storage_controller.consistency_check() check_memory() - # Restart pageservers: this exercises the /re-attach API - for pageserver in env.pageservers: - pageserver.stop() - pageserver.start() + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. 
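[Editorial note] A short worked example of the balance tolerance enforced by `assert_consistent_balanced_attachments` above. The tenant count and pageserver count here are assumed values for illustration, not taken from the patch; `shard_count = 2` and the 5% flake factor do come from the test code above.

```python
# Worked example of the attachment-balance check; tenant_count is an assumption.
tenant_count = 500                 # assumed for illustration
shard_count = 2                    # matches the test parameterization above
total_shards = tenant_count * shard_count

flake_factor = 5 / 100             # tolerance used by assert_consistent_balanced_attachments
allowed_spread = int(total_shards * flake_factor)

# With, say, 5 pageservers (assumed), a perfectly even layout is 200 attached shards
# per node, and the busiest and least-busy nodes may differ by at most allowed_spread.
print(f"{total_shards} shards -> max allowed spread between nodes: {allowed_spread}")
```
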
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1b294fb2d0..a78f566f0e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1518,49 +1518,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - -def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): """ Graceful reststart of storage controller clusters use the drain and @@ -1601,10 +1558,10 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1614,12 +1571,12 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1657,15 +1614,15 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, max_attempts=3, backoff=2, ) - poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) 
env.storage_controller.cancel_node_drain(ps_id_to_drain) - poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) From a86b43fcd7ba679212de1c4adb7e22a27c48bbab Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 4 Jul 2024 09:03:03 +0100 Subject: [PATCH 169/412] proxy: cache certain non-retriable console errors for a short time (#8201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem If there's a quota error, it makes sense to cache it for a short window of time. Many clients do not handle database connection errors gracefully, so just spam retry 🤡 ## Summary of changes Updates the node_info cache to support storing console errors. Store console errors if they cannot be retried (using our own heuristic. should only trigger for quota exceeded errors). --- proxy/src/cache/common.rs | 7 +++ proxy/src/cache/timed_lru.rs | 38 ++++++++++++- proxy/src/console/messages.rs | 62 +++++++++++---------- proxy/src/console/provider.rs | 6 +- proxy/src/console/provider/neon.rs | 89 ++++++++++++++++++++++-------- proxy/src/proxy/tests.rs | 4 +- 6 files changed, 146 insertions(+), 60 deletions(-) diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index bc1c37512b..4e393fddb2 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -53,6 +53,13 @@ impl Cached { ) } + pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb9..c5c4f6a1ed 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -65,6 +65,8 @@ impl Cache for TimedLru { struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. 
+ #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,6 +207,21 @@ impl TimedLru { } impl TimedLru { + pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value: (), + }; + + (old, cached) + } + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { let (created_at, old) = self.insert_raw(key.clone(), value.clone()); diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index d28d13ba69..9abf24ab7f 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -9,7 +9,7 @@ use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct ConsoleError { pub error: Box, #[serde(skip)] @@ -82,41 +82,19 @@ impl CouldRetry for ConsoleError { .details .error_info .map_or(Reason::Unknown, |e| e.reason); - match reason { - // not a transitive error - Reason::RoleProtected => false, - // on retry, it will still not be found - Reason::ResourceNotFound - | Reason::ProjectNotFound - | Reason::EndpointNotFound - | Reason::BranchNotFound => false, - // we were asked to go away - Reason::RateLimitExceeded - | Reason::NonDefaultBranchComputeTimeExceeded - | Reason::ActiveTimeQuotaExceeded - | Reason::ComputeTimeQuotaExceeded - | Reason::WrittenDataQuotaExceeded - | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, - // transitive error. control plane is currently busy - // but might be ready soon - Reason::RunningOperations => true, - Reason::ConcurrencyLimitReached => true, - Reason::LockAlreadyTaken => true, - // unknown error. better not retry it. - Reason::Unknown => false, - } + + reason.can_retry() } } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Status { pub code: Box, pub message: Box, pub details: Details, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Details { pub error_info: Option, pub retry_info: Option, @@ -199,6 +177,34 @@ impl Reason { | Reason::BranchNotFound ) } + + pub fn can_retry(&self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. 
control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. + Reason::Unknown => false, + } + } } #[derive(Copy, Clone, Debug, Deserialize)] @@ -206,7 +212,7 @@ pub struct RetryInfo { pub retry_delay_ms: u64, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct UserFacingMessage { pub message: Box, } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index bec55a8343..7a9637066f 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -2,7 +2,7 @@ pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, @@ -317,8 +317,8 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type NodeInfoCache = TimedLru>>; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 41bd2f4956..a6e67be22f 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -9,7 +9,7 @@ use super::{ use crate::{ auth::backend::ComputeUserInfo, compute, - console::messages::ColdStartInfo, + console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, @@ -17,10 +17,10 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, @@ -273,26 +273,34 @@ impl super::Api for Api { ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. 
- if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); } // check rate limit @@ -300,23 +308,56 @@ impl super::Api for Api { .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize_intern(), 1) { - info!(key = &*key, "found cached compute node info"); return Err(WakeComputeError::TooManyConnections); } - let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; - ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info; - info!("woken up a compute node"); + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); - // store the cached node as 'warm' - node.aux.cold_start_info = ColdStartInfo::WarmCached; - let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); - cached.aux.cold_start_info = cold_start_info; + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - info!(key = &*key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - Ok(cached) + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 8119f39fae..5186a9e1b0 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -540,8 +540,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn }, allow_self_signed_compute: false, }; - let (_, node) = cache.insert("key".into(), node); - node + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( From 1d0ec50ddbbfa096898bc58051a1d54ee1dc57f0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 4 Jul 2024 09:20:01 +0100 Subject: [PATCH 170/412] CI(build-and-test): add conclusion job (#8246) ## Problem Currently, if you need to rename a job and the job is listed in [branch protection rules](https://github.com/neondatabase/neon/settings/branch_protection_rules), the PR won't be allowed to merge. 
## Summary of changes - Add `conclusion` job that fails if any of its dependencies don't finish successfully --- .github/workflows/build_and_test.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5ac8c6ec27..9b75d0bf3c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1368,3 +1368,31 @@ jobs: with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit + + # This job simplifies setting branch protection rules (in GitHub UI) + # by allowing to set only this job instead of listing many others. + # It also makes it easier to rename or parametrise jobs (using matrix) + # which requires changes in branch protection rules + # + # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that. + # + # https://github.com/neondatabase/neon/settings/branch_protection_rules + conclusion: + if: always() + # Format `needs` differently to make the list more readable. + # Usually we do `needs: [...]` + needs: + - check-codestyle-python + - check-codestyle-rust + - regress-tests + - test-images + runs-on: ubuntu-22.04 + steps: + # The list of possible results: + # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Fail the job if any of the dependencies do not succeed + run: exit 1 + if: | + contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') + || contains(needs.*.result, 'skipped') From 2897dcc9aa20dd44389521908a2be62b22d71377 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Jul 2024 13:22:33 +0100 Subject: [PATCH 171/412] pageserver: increase rate limit duration for layer visit log (#8263) ## Problem I'd like to keep this in the tree since it might be useful in prod as well. It's a bit too noisy as is and missing the lsn. ## Summary of changes Add an lsn field and and increase the rate limit duration. --- pageserver/src/tenant/timeline.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 54bbdef56e..bbf0d0a4bf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1284,15 +1284,14 @@ impl Timeline { if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { use utils::rate_limit::RateLimit; static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); let mut rate_limit = LOGGED.lock().unwrap(); rate_limit.call(|| { tracing::info!( - tenant_id = %self.tenant_shard_id.tenant_id, - shard_id = %self.tenant_shard_id.shard_slug(), - timeline_id = %self.timeline_id, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. 
{}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); }); } From 0a63bc4818d620bccbc9e2403b4235e31f7d3bef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 4 Jul 2024 15:04:08 +0200 Subject: [PATCH 172/412] Use bool param for round_trip_test_compressed (#8252) As per @koivunej 's request in https://github.com/neondatabase/neon/pull/8238#discussion_r1663892091 , use a runtime param instead of monomorphizing the function based on the value. Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/tenant/blob_io.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index de74066b81..1a6a5702f1 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -393,11 +393,12 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { - round_trip_test_compressed::(blobs).await + round_trip_test_compressed::(blobs, false).await } - async fn round_trip_test_compressed( + async fn round_trip_test_compressed( blobs: &[Vec], + compression: bool, ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); @@ -409,7 +410,7 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = if COMPRESSION { + let (_, res) = if compression { wtr.write_blob_maybe_compressed( blob.clone(), &ctx, @@ -432,7 +433,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new_with_compression(rdr, COMPRESSION); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -466,8 +467,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; - round_trip_test_compressed::(blobs).await?; - round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -483,8 +484,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; - round_trip_test_compressed::(blobs).await?; - round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } From 0e4832308d53da8b546d9f146a7f1f2f597e21f1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 4 Jul 2024 14:58:01 +0100 Subject: [PATCH 173/412] CI(pg-clients): unify workflow with build-and-test (#8160) ## Problem `pg-clients` workflow looks different from the main `build-and-test` workflow for historical reasons (it was my very first task at Neon, and back then I wasn't really familiar with the rest of the CI pipelines). 
This PR unifies `pg-clients` workflow with `build-and-test` ## Summary of changes - Rename `pg_clients.yml` to `pg-clients.yml` - Run the workflow on changes in relevant files - Create Allure report for tests - Send slack notifications to `#on-call-qa-staging-stream` channel (instead of `#on-call-staging-stream`) - Update Client libraries once we're here --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/pg-clients.yml | 115 ++++++++ .github/workflows/pg_clients.yml | 98 ------- Dockerfile.build-tools | 22 +- test_runner/pg_clients/java/jdbc/Dockerfile | 2 +- .../pg_clients/python/pg8000/requirements.txt | 2 +- .../pg_clients/rust/tokio-postgres/Cargo.lock | 273 +++++++++--------- .../pg_clients/rust/tokio-postgres/Cargo.toml | 4 +- .../pg_clients/rust/tokio-postgres/Dockerfile | 2 +- .../swift/PostgresClientKitExample/Dockerfile | 4 +- .../PostgresClientKitExample/Package.resolved | 12 +- .../PostgresClientKitExample/Package.swift | 2 +- .../swift/PostgresNIOExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Package.resolved | 25 +- .../swift/PostgresNIOExample/Package.swift | 4 +- .../typescript/postgresql-client/Dockerfile | 2 +- .../postgresql-client/package-lock.json | 12 +- .../typescript/postgresql-client/package.json | 2 +- .../typescript/serverless-driver/Dockerfile | 2 +- .../serverless-driver/package-lock.json | 144 +++++---- .../typescript/serverless-driver/package.json | 2 +- 21 files changed, 403 insertions(+), 332 deletions(-) create mode 100644 .github/workflows/pg-clients.yml delete mode 100644 .github/workflows/pg_clients.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9b75d0bf3c..a3246987e2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name}} + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 0000000000..e21e45c929 --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,115 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: 
./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index dd09abddb8..0000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: ubuntu-22.04 - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. - # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v4 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 30314376ef..4826b7914e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,5 +1,13 @@ FROM debian:bullseye-slim +# Use ARG as a build-time environment variable here to allow. +# It's not supposed to be set outside. +# Alternatively it can be obtained using the following command +# ``` +# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -66,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && apt update \ + && apt install -y docker-ce docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7e074e07b8..7c2b1b40e0 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:21 +FROM openjdk:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index e086a937e6..099a4ade2c 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.30.5 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a4a2426b97..32c1c52eea 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,15 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.89" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" [[package]] name = "cfg-if" @@ -154,9 
+154,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -170,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -296,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -329,29 +323,23 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -375,15 +363,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ -401,11 
+389,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -419,9 +406,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] @@ -438,7 +425,7 @@ version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -466,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -478,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -488,15 +475,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.2", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -525,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -591,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -646,6 +633,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = 
"0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -658,17 +654,17 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -692,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -705,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -741,15 +737,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -757,26 +753,26 @@ dependencies = [ [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", "unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" 
dependencies = [ "proc-macro2", "quote", @@ -797,9 +793,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -812,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", @@ -828,9 +824,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -875,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -933,6 +909,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1023,11 +1005,11 @@ dependencies = [ [[package]] name = "whoami" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" dependencies = [ - "redox_syscall", + "redox_syscall 0.4.1", "wasite", "web-sys", ] @@ -1047,7 +1029,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -1067,17 +1049,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - 
"windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1088,9 +1071,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -1100,9 +1083,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -1112,9 +1095,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -1124,9 +1113,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -1136,9 +1125,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -1148,9 +1137,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -1160,6 +1149,6 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 0f420e5b06..27d01810bd 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.36", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 8611e66cbb..3e214de785 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.76 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 0402838820..6006e61ee2 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9dd..6e8613095f 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd023..a66d09c542 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 9130e0973f..d6815fbb5f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 023e03a7b1..0e5dfdafcb 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", - "version" : "1.20.2" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -81,6 +91,15 @@ "version" : "1.20.1" } }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,5 +110,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index 637eb4bc9d..20bb10f76c 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.9 +// swift-tools-version:5.10 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 004b383749..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index b4f8587eac..19311808b6 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { @@ -42,9 +42,10 @@ } }, "node_modules/postgresql-client": { - "version": "2.10.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", - "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { "doublylinked": "^2.5.4", "lightning-pool": "^4.2.2", @@ -55,8 +56,7 @@ "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 07ec100d0d..d2bba23d29 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 004b383749..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index f3b456f1ed..7f3f7f2e84 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,96 +5,138 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", - "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, - "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" - }, - "node_modules/pg-types": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", - "dependencies": { - "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" - }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": 
"sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", "engines": { "node": ">=4" } }, + "node_modules/pg-protocol": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" + }, + "node_modules/pg-types": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", + "dependencies": { + "pg-int8": "1.0.1", + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + 
"node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { "version": "8.17.1", "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 3ae7a8a6cf..f791d184c5 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } } From 7e2a3d2728245d7d34e443b4a08ea041263f4f36 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Jul 2024 15:05:41 +0100 Subject: [PATCH 174/412] pageserver: downgrade stale generation messages to INFO (#8256) ## Problem When generations were new, these messages were an important way of noticing if something unexpected was going on. We found some real issues when investigating tests that unexpectedly tripped them. At time has gone on, this code is now pretty battle-tested, and as we do more live migrations etc, it's fairly normal to see the occasional message from a node with a stale generation. At this point the cognitive load on developers to selectively allow-list these logs outweighs the benefit of having them at warn severity. Closes: https://github.com/neondatabase/neon/issues/8080 ## Summary of changes - Downgrade "Dropped remote consistent LSN updates" and "Dropping stale deletions" messages to INFO - Remove all the allow-list entries for these logs. 
--- pageserver/src/deletion_queue/validator.rs | 4 ++-- test_runner/fixtures/pageserver/many_tenants.py | 4 ---- .../interactive/test_many_small_tenants.py | 4 ---- .../pagebench/test_large_slru_basebackup.py | 4 ---- ...server_max_throughput_getpage_at_latest_lsn.py | 4 ---- .../performance/test_storage_controller_scale.py | 8 -------- test_runner/regress/test_attach_tenant_config.py | 6 ------ test_runner/regress/test_change_pageserver.py | 5 ----- test_runner/regress/test_layers_from_future.py | 3 --- .../regress/test_pageserver_generations.py | 13 ------------- test_runner/regress/test_pageserver_secondary.py | 3 --- test_runner/regress/test_remote_storage.py | 7 ------- test_runner/regress/test_sharding.py | 4 ---- test_runner/regress/test_storage_controller.py | 15 --------------- test_runner/regress/test_tenant_conf.py | 4 ---- test_runner/regress/test_tenant_detach.py | 12 ------------ test_runner/regress/test_tenant_relocation.py | 2 -- test_runner/regress/test_tenants.py | 4 ---- 18 files changed, 2 insertions(+), 104 deletions(-) diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e67..d215fd2b7d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 8730d8ef75..c437258c6f 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -42,10 +42,6 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 0ff9c8fdaa..33848b06d3 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -55,10 +55,6 @@ def setup_env( } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from 
storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b66db4d0ab..b41ae60197 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index a8f48fe675..60861cf939 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -164,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index d65a66b010..3a6113706f 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -132,14 +132,6 @@ def test_storage_controller_many_tenants( ) for ps in env.pageservers: - # This can happen because when we do a loop over all pageservers and mark them offline/active, - # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of - # bumping generation before other attachments are detached. - # - # We could clean this up by making reconcilers respect the .observed of their predecessor, if - # we spawn with a wait for the predecessor. - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Storage controller is allowed to drop pageserver requests when the cancellation token # for a Reconciler fires. 
ps.allowed_errors.append(".*request was dropped before completing.*") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e117c2140f..f2ee2b70aa 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: [ # eviction might be the first one after an attach to access the layers ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", - # detach can happen before we get to validate the generation number - ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 97ab69049d..4d2cdb8e32 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. - pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 54d3b2d515..3b2218dd9b 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 696af24e5c..7ce38c5c3c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - main_pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) @@ -397,8 +393,6 @@ def test_deletion_queue_recovery( # validated before restart. 
assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -553,13 +547,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8431840dc0..4c828b86b0 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -83,9 +83,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): for ps in env.pageservers: ps.allowed_errors.extend( [ - # We will make no effort to avoid stale attachments - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b26bd3422f..fac7fe9dee 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -355,13 +355,6 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before - # we later increment when actually attaching it again, leading to skipping a generation and potentially getting - # these warnings if there was a durable but un-executed deletion list at time of restart. 
- env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8267d3f36c..d414f986e6 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1144,10 +1144,6 @@ def test_sharding_split_failures( ) for ps in env.pageservers: - # When we do node failures and abandon a shard, it will de-facto have old generation and - # thereby be unable to publish remote consistent LSN updates - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # If we're using a failure that will panic the storage controller, all background # upcalls from the pageserver can fail ps.allowed_errors.append(".*calling control plane generation validation API failed.*") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index a78f566f0e..d37f7aae3d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -60,11 +60,6 @@ def test_storage_controller_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() - for pageserver in env.pageservers: - # This test detaches tenants during migration, which can race with deletion queue operations, - # during detach we only do an advisory flush, we don't wait for it. - pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) - # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.storage_controller.start() @@ -484,9 +479,6 @@ def test_storage_controller_compute_hook( # Start running env = neon_env_builder.init_start() - # We will to an unclean migration, which will result in deletion queue warnings - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") - # Initial notification from tenant creation assert len(notifications) == 1 expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { @@ -1054,13 +1046,6 @@ def test_storage_controller_heartbeats( online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids for node_id in offline_node_ids: - env.get_pageserver(node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) - if len(offline_node_ids) > 1: env.get_pageserver(node_id).allowed_errors.append( ".*Scheduling error when marking pageserver.*offline.*", diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2cbb036c0d..80fb2b55b8 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2056840558..b165588636 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -349,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -422,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9fe732e288..43e9a0d36e 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -203,8 +203,6 @@ def test_tenant_relocation( [ # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] ) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 3705406c2f..04b3fdd80f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -386,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive. 
From bd2046e1abbfae5558c8971356dd2d625ce4ce28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 4 Jul 2024 17:07:16 +0200 Subject: [PATCH 175/412] Add find-large-objects subcommand to scrubber (#8257) Adds a find-large-objects subcommand to the scrubber to allow listing layer objects larger than a specific size. To be used like: ``` AWS_PROFILE=dev REGION=us-east-2 BUCKET=neon-dev-storage-us-east-2 cargo run -p storage_scrubber -- find-large-objects --min-size 250000000 --ignore-deltas ``` Part of #5431 --- storage_scrubber/src/checks.rs | 2 +- storage_scrubber/src/find_large_objects.rs | 97 ++++++++++++++++++++++ storage_scrubber/src/lib.rs | 1 + storage_scrubber/src/main.rs | 18 ++++ 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 storage_scrubber/src/find_large_objects.rs diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 4eb8580e32..f687b24320 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -259,7 +259,7 @@ pub(crate) enum BlobDataParseResult { Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { +pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? Some((layer_filename, gen)) if gen.len() == 8 => { diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs new file mode 100644 index 0000000000..24668b6516 --- /dev/null +++ b/storage_scrubber/src/find_large_objects.rs @@ -0,0 +1,97 @@ +use futures::StreamExt; +use pageserver::tenant::storage_layer::LayerName; +use serde::{Deserialize, Serialize}; + +use crate::{ + checks::parse_layer_object_name, init_remote, list_objects_with_retries, + metadata_stream::stream_tenants, BucketConfig, NodeKind, +}; + +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +enum LargeObjectKind { + DeltaLayer, + ImageLayer, + Other, +} + +impl LargeObjectKind { + fn from_key(key: &str) -> Self { + let fname = key.split('/').last().unwrap(); + + let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { + return LargeObjectKind::Other; + }; + + match layer_name { + LayerName::Image(_) => LargeObjectKind::ImageLayer, + LayerName::Delta(_) => LargeObjectKind::DeltaLayer, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObject { + pub key: String, + pub size: u64, + kind: LargeObjectKind, +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObjectListing { + pub objects: Vec, +} + +pub async fn find_large_objects( + bucket_config: BucketConfig, + min_size: u64, + ignore_deltas: bool, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let mut tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let mut objects = Vec::new(); + let mut tenant_ctr = 0u64; + let mut object_ctr = 0u64; + while let Some(tenant_shard_id) = tenants.next().await { + let tenant_shard_id = tenant_shard_id?; + let mut tenant_root = target.tenant_root(&tenant_shard_id); + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + 
}) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + object_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + tenant_ctr += 1; + if tenant_ctr % 50 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", objects.len() + ); + } + } + Ok(LargeObjectListing { objects }) +} diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 64273432fc..6adaa5d38f 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -2,6 +2,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; +pub mod find_large_objects; pub mod garbage; pub mod metadata_stream; pub mod pageserver_physical_gc; diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 222bd10ed2..10699edd3c 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,6 +1,7 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; +use storage_scrubber::find_large_objects; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -72,6 +73,12 @@ enum Command { #[arg(short, long, default_value_t = GcMode::IndicesOnly)] mode: GcMode, }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + }, } #[tokio::main] @@ -86,6 +93,7 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. } => "find-large-objects", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -199,5 +207,15 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::FindLargeObjects { + min_size, + ignore_deltas, + } => { + let summary = + find_large_objects::find_large_objects(bucket_config, min_size, ignore_deltas) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } From 32828cddd6a55dee48cc93d7fa1d554e110fc845 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:09:05 -0400 Subject: [PATCH 176/412] feat(pageserver): integrate lsn lease into synthetic size (#8220) Part of #7497, closes #8071. (accidentally closed #8208, reopened here) ## Problem After the changes in #8084, we need synthetic size to also account for leased LSNs so that users do not get free retention by running a small ephemeral endpoint for a long time. ## Summary of changes This PR integrates LSN leases into the synthetic size calculation. We model leases as read-only branches started at the leased LSN (except it does not have a timeline id). Other changes: - Add new unit tests testing whether a lease behaves like a read-only branch. - Change `/size_debug` response to include lease point in the SVG visualization. - Fix `/lsn_lease` HTTP API to do proper parsing for POST. 
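
As a usage sketch (illustrative only), acquiring a lease from a test now sends the LSN in a JSON body rather than a query parameter; `env`, `ep`, `tenant`, and `timeline` below are the usual test fixtures:

```python
# Minimal sketch using the timeline_lsn_lease helper added in this PR;
# env/ep/tenant/timeline come from the standard neon test fixtures.
from fixtures.log_helper import log
from fixtures.neon_fixtures import wait_for_last_flush_lsn


def acquire_lease(env, ep, tenant, timeline):
    client = env.pageserver.http_client()
    # Pin the most recently flushed LSN; the request body is {"lsn": "<lsn>"}.
    lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
    res = client.timeline_lsn_lease(tenant, timeline, lsn)
    log.info(f"lsn lease response: {res}")
    return res
```

On the pageserver side this corresponds to `POST /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease` with an `{"lsn": ...}` body, per the OpenAPI change below.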
Signed-off-by: Yuchen Liang Co-authored-by: Joonas Koivunen Co-authored-by: Christian Schwarz --- libs/pageserver_api/src/models.rs | 5 ++ libs/tenant_size_model/src/calculation.rs | 4 +- libs/tenant_size_model/src/svg.rs | 36 ++++++++-- pageserver/src/http/openapi_spec.yml | 22 +++--- pageserver/src/http/routes.rs | 18 +++-- pageserver/src/tenant/size.rs | 85 ++++++++++++++++++++-- pageserver/src/tenant/timeline.rs | 9 +++ test_runner/fixtures/pageserver/http.py | 16 +++++ test_runner/regress/test_tenant_size.py | 88 +++++++++++++++++++++++ 9 files changed, 256 insertions(+), 27 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9228953761..ad65602f54 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -228,6 +228,11 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + #[derive(Serialize, Deserialize)] pub struct TenantShardSplitRequest { pub new_shard_count: u8, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee65..be00562219 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d..0de2890bb4 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. +#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. 
// TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 58ff6e3f83..5ba329f05e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -265,15 +265,19 @@ paths: type: string format: hex post: - description: Obtain lease for the given LSN - parameters: - - name: lsn - in: query - required: true - schema: - type: string - format: hex - description: A LSN to obtain the lease for + description: Obtains a lease for the given LSN. + requestBody: + content: + application/json: + schema: + type: object + required: + - lsn + properties: + lsn: + description: A LSN to obtain the lease for. + type: string + format: hex responses: "200": description: OK diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6a6f17604d..893302b7d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -22,6 +22,7 @@ use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; @@ -42,7 +43,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; -use tenant_size_model::{SizeResult, StorageModel}; +use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; @@ -1195,10 +1196,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1531,15 +1537,13 @@ async fn handle_tenant_break( // Obtains an lsn lease on the given timeline. 
async fn lsn_lease_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - - let lsn: Lsn = parse_query_param(&request, "lsn")? - .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index b2338b620e..23354417e7 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -87,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -103,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as @@ -124,6 +143,9 @@ pub struct TimelineInputs { /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. @@ -234,6 +256,13 @@ pub(super) async fn gather_inputs( None }; + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -248,6 +277,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + drop(gc_info); // Add branch points we collected earlier, just in case there were any that were @@ -296,6 +327,7 @@ pub(super) async fn gather_inputs( if kind == LsnKind::BranchPoint { branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { segment: Segment { parent: Some(parent), @@ -306,7 +338,45 @@ pub(super) async fn gather_inputs( timeline_id: timeline.timeline_id, kind, }); - parent += 1; + + parent = segments.len() - 1; + + if kind == LsnKind::LeasePoint { + // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data + // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN + // value. 
Without the other two segments, the calculation code would not count the leased LSN as a point + // to be retained. + // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug. + // + // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and + // branch points can be given a synthetic id so we can unite them. + let mut lease_parent = parent; + + // Start of a lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseStart, + }); + lease_parent += 1; + + // End of the lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -332,6 +402,7 @@ pub(super) async fn gather_inputs( pitr_cutoff, next_gc_cutoff, retention_param_cutoff, + lease_points, }); } @@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/2210CD0", "pitr_cutoff": "0/2210CD0", "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", @@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/1817770", "pitr_cutoff": "0/1817770", "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", @@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/18B3D98", "pitr_cutoff": "0/18B3D98", "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] } ] } @@ -749,7 +823,8 @@ fn verify_size_for_one_branch() { "horizon_cutoff": "47/240A5860", "pitr_cutoff": "47/240A5860", "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bbf0d0a4bf..42e55ab269 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; +use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; @@ -1590,7 +1591,13 @@ impl Timeline { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; + let dt: DateTime = valid_until.into(); + info!("lease extended to {}", dt); + } else { + let dt: DateTime = existing_lease.valid_until.into(); + info!("existing lease covers greater length, valid until {}", dt); } + existing_lease.clone() } else { // Reject already GC-ed LSN (lsn < latest_gc_cutoff) @@ -1599,6 +1606,8 @@ impl Timeline { bail!("tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } + let dt: DateTime = valid_until.into(); + info!("lease created, valid until {}", dt); entry.or_insert(LsnLease { valid_until }).clone() } }; diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 3da0be8021..03aee9e5c5 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -599,6 +599,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_get_timestamp_of_lsn( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn ): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 6c85ddebbc..70e8fe67d5 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -10,6 +10,7 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) @@ -710,3 +711,90 @@ def mask_model_inputs(x): return newlist else: return x + + +@pytest.mark.parametrize("zero_gc", [True, False]) +def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool): + """ + Compare a LSN lease to a read-only branch for synthetic size calculation. + They should have the same effect. + """ + + conf = { + "pitr_interval": "0s" if zero_gc else "3600s", + "gc_period": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + ro_branch_res = insert_with_action( + env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" + ) + + tenant, timeline = env.neon_cli.create_tenant(conf=conf) + lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") + + assert_size_approx_equal(lease_res, ro_branch_res) + + +def insert_with_action( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + test_output_dir: Path, + action: str, +) -> int: + """ + Inserts some data on the timeline, perform an action, and insert more data on the same timeline. + Returns the size at the end of the insertion. + + Valid actions: + - "lease": Acquires a lease. + - "branch": Creates a child branch but never writes to it. 
+ """ + + client = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + initial_size = client.tenant_size(tenant) + log.info(f"initial size: {initial_size}") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + if action == "lease": + res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) + log.info(f"result from lsn_lease api: {res}") + elif action == "branch": + ro_branch = env.neon_cli.create_branch( + "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ) + log.info(f"{ro_branch=} created") + else: + raise AssertionError("Invalid action type, only `lease` and `branch`are accepted") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + # Avoid flakiness when calculating logical size. + flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert From e770aeee9266d16067d993fbedf31a079b4d0578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 4 Jul 2024 18:59:19 +0200 Subject: [PATCH 177/412] Flatten compression algorithm setting (#8265) This flattens the compression algorithm setting, removing the `Option<_>` wrapping layer and making handling of the setting easier. It also adds a specific setting for *disabled* compression with the continued ability to read copmressed data, giving us the option to more easily back out of a compression rollout, should the need arise, which was one of the limitations of #8238. Implements my suggestion from https://github.com/neondatabase/neon/pull/8238#issuecomment-2206181594 , inspired by Christian's review in https://github.com/neondatabase/neon/pull/8238#pullrequestreview-2156460268 . Part of #5431 --- libs/pageserver_api/src/models.rs | 15 ++++++++++++++- pageserver/src/config.rs | 11 ++++++----- pageserver/src/tenant/blob_io.rs | 18 +++++++++++++----- .../src/tenant/storage_layer/delta_layer.rs | 4 ++-- pageserver/src/tenant/storage_layer/layer.rs | 2 +- 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ad65602f54..ecc543917e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -450,9 +450,22 @@ pub enum CompactionAlgorithm { )] #[strum(serialize_all = "kebab-case")] pub enum ImageCompressionAlgorithm { + /// Disabled for writes, and never decompress during reading. + /// Never set this after you've enabled compression once! + DisabledNoDecompress, + // Disabled for writes, support decompressing during read path + Disabled, /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). 
- Zstd { level: Option }, + Zstd { + level: Option, + }, +} + +impl ImageCompressionAlgorithm { + pub fn allow_decompression(&self) -> bool { + !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress) + } } #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fa7f7d8d97..b7c9af2244 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -91,7 +91,8 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - pub const DEFAULT_IMAGE_COMPRESSION: Option = None; + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::DisabledNoDecompress; pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; @@ -288,7 +289,7 @@ pub struct PageServerConf { pub validate_vectored_get: bool, - pub image_compression: Option, + pub image_compression: ImageCompressionAlgorithm, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount @@ -402,7 +403,7 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, - image_compression: BuilderValue>, + image_compression: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, @@ -680,7 +681,7 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } - pub fn get_image_compression(&mut self, value: Option) { + pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { self.image_compression = BuilderValue::Set(value); } @@ -1028,7 +1029,7 @@ impl PageServerConf { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } "image_compression" => { - builder.get_image_compression(Some(parse_toml_from_str("image_compression", item)?)) + builder.get_image_compression(parse_toml_from_str("image_compression", item)?) } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 1a6a5702f1..0705182d5d 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -273,7 +273,12 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed(srcbuf, ctx, None).await + self.write_blob_maybe_compressed( + srcbuf, + ctx, + ImageCompressionAlgorithm::DisabledNoDecompress, + ) + .await } /// Write a blob of data. 
Returns the offset that it was written to, @@ -282,7 +287,7 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, - algorithm: Option, + algorithm: ImageCompressionAlgorithm, ) -> (B::Buf, Result) { let offset = self.offset; @@ -314,7 +319,7 @@ impl BlobWriter { ); } let (high_bit_mask, len_written, srcbuf) = match algorithm { - Some(ImageCompressionAlgorithm::Zstd { level }) => { + ImageCompressionAlgorithm::Zstd { level } => { let mut encoder = if let Some(level) = level { async_compression::tokio::write::ZstdEncoder::with_quality( Vec::new(), @@ -335,7 +340,10 @@ impl BlobWriter { (BYTE_UNCOMPRESSED, len, slice.into_inner()) } } - None => (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()), + ImageCompressionAlgorithm::Disabled + | ImageCompressionAlgorithm::DisabledNoDecompress => { + (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) + } }; let mut len_buf = (len_written as u32).to_be_bytes(); assert_eq!(len_buf[0] & 0xf0, 0); @@ -414,7 +422,7 @@ mod tests { wtr.write_blob_maybe_compressed( blob.clone(), &ctx, - Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), + ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await } else { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e6a4d6d5c4..685f6dce60 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; +use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -453,7 +453,7 @@ impl DeltaLayerWriterInner { ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); // We don't want to use compression in delta layer creation - let compression = None; + let compression = ImageCompressionAlgorithm::DisabledNoDecompress; let (val, res) = self .blob_writer .write_blob_maybe_compressed(val, ctx, compression) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d1f5cc8f43..afd11780e7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1685,7 +1685,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - owner.conf.image_compression.is_some(), + owner.conf.image_compression.allow_decompression(), ctx, ) .await From 3ee82a98955c35daad4ac1790cc6ade6afccf43a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 22:03:58 +0300 Subject: [PATCH 178/412] implement rolling hyper-log-log algorithm (#8068) ## Problem See #7466 ## Summary of changes Implement algorithm descried in https://hal.science/hal-00465313/document Now new GUC is added: `neon.wss_max_duration` which specifies size of sliding window (in seconds). Default value is 1 hour. It is possible to request estimation of working set sizes (within this window using new function `approximate_working_set_size_seconds`. Old function `approximate_working_set_size` is preserved for backward compatibility. But its scope is also limited by `neon.wss_max_duration`. Version of Neon extension is changed to 1.4 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. 
- [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Matthias van de Meent --- pgxn/neon/Makefile | 3 +- pgxn/neon/file_cache.c | 42 ++-- pgxn/neon/hll.c | 193 ++++++++++++++++++ pgxn/neon/hll.h | 86 ++++++++ pgxn/neon/neon--1.3--1.4.sql | 9 + pgxn/neon/neon--1.4--1.3.sql | 1 + .../test_lfc_working_set_approximation.py | 44 ++++ test_runner/regress/test_neon_extension.py | 2 +- 8 files changed, 363 insertions(+), 17 deletions(-) create mode 100644 pgxn/neon/hll.c create mode 100644 pgxn/neon/hll.h create mode 100644 pgxn/neon/neon--1.3--1.4.sql create mode 100644 pgxn/neon/neon--1.4--1.3.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb91..3b755bb042 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31f..1894e8c72a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,8 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. 
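For orientation while reading the new `pgxn/neon/hll.c` further below: instead of keeping a single maximum bit-rank per register, the sliding variant stores one timestamp per possible rank, and a query with a given window start takes, for each register, the largest rank whose timestamp still falls inside the window. The final step is then the standard HyperLogLog estimate; this note is a reading aid rather than part of the patch, but `ALPHA_MM / sum` in `estimateSHLL` computes exactly this raw value before the small- and large-range corrections:

```
E = \alpha_m \, m^2 \Big/ \sum_{j=1}^{m} 2^{-R_j},
\qquad \alpha_m = \frac{0.7213}{1 + 1.079/m},
\qquad m = \mathrm{HLL\_N\_REGISTERS} = 1024
```

where R_j is the largest rank observed in register j since the window start. From SQL the window is given in seconds, e.g. `SELECT approximate_working_set_size_seconds(3600)` for the last hour; omitting the argument (or passing NULL) covers the whole retained window.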
@@ -62,7 +63,6 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -87,8 +87,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -238,12 +237,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 0000000000..f8496b3125 --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initSHLL(HyperLogLogState *cState) +{ + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ +void +addSHLL(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + cState->regs[index][count] = now; +} + +static uint8 +getMaximum(const TimestampTz* reg, TimestampTz since) +{ + uint8 max = 0; + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) + { + if (reg[i] >= since) + { + max = i; + } + } + + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateSHLL(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 0000000000..9256cb9afa --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. + */ +typedef struct HyperLogLogState +{ + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; +} HyperLogLogState; + +extern void initSHLL(HyperLogLogState *cState); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 0000000000..042effe346 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 0000000000..bea72d1a6b --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f7..6465bdfd21 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ WITH (fillfactor='100'); blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.4'") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b4865026..e83aaf91c6 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -50,7 +50,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Ensure that the default version is also updated in the neon.control file assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: From d35ddfbab737721a72987f84c9f7f8f9113f2ef3 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 4 Jul 2024 22:17:45 +0200 Subject: [PATCH 
179/412] add checkout depth1 to workflow to access local github actions like generate allure report (#8259) ## Problem job step to create allure report fails https://github.com/neondatabase/neon/actions/runs/9781886710/job/27006997416#step:11:1 ## Summary of changes Shallow checkout of sources to get access to local github action needed in the job step ## Example run example run with this change https://github.com/neondatabase/neon/actions/runs/9790647724 do not merge this PR until the job is clean --------- Co-authored-by: Alexander Bayandin --- .github/workflows/periodic_pagebench.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index c0219599a2..a8baf6bf7a 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -43,6 +43,10 @@ jobs: AWS_DEFAULT_REGION : "eu-central-1" AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" steps: + # we don't need the neon source code because we run everything remotely + # however we still need the local github actions to run the allure step below + - uses: actions/checkout@v4 + - name: Show my own (github runner) external IP address - usefull for IP allowlisting run: curl https://ifconfig.me @@ -116,6 +120,9 @@ jobs: cat "test_log_${GITHUB_RUN_ID}" - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate From 32fc2dd6835adfeb06fa267910b666500140aadd Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Jul 2024 10:09:15 +0100 Subject: [PATCH 180/412] tests: extend allow list in deletion test (#8268) ## Problem 1ea5d8b1327d2e93cbe11682f60a90e35d42d1ee tolerated this as an error message, but it can show up in logs as well. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8201/9780147712/index.html#testresult/263422f5f5f292ea/retries ## Summary of changes - Tolerate "failed to delete 1 objects" in pageserver logs, this occurs occasionally when injected failures exhaust deletion's retries. 
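The broadened allowlist entry in the diff below is deliberately written to cover any object count, not only the single-object message quoted above. A minimal sketch of that matching, for illustration only and using the Rust `regex` crate rather than the Python test harness:

```
use regex::Regex;

fn main() {
    // Same pattern as the allowlist entry added in the test below.
    let allowed = Regex::new(r".*failed to delete .+ objects.*").unwrap();
    assert!(allowed.is_match("failed to delete 1 objects"));
    // ...and it also covers retries that give up on larger batches.
    assert!(allowed.is_match("failed to delete 42 objects"));
}
```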
--- test_runner/regress/test_tenant_delete.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index d3fba32a19..1d7c8b8e31 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -67,8 +67,9 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(".*NotFound.*") - env.pageserver.allowed_errors.append(".*simulated failure.*") + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness From 5ffcb688cc97d8492ef101eab88eeff46b70c13a Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 5 Jul 2024 11:23:46 +0200 Subject: [PATCH 181/412] correct error handling for periodic pagebench runner status (#8274) ## Problem the following periodic pagebench run was failed but was still shown as successful https://github.com/neondatabase/neon/actions/runs/9798909458/job/27058179993#step:9:47 ## Summary of changes if the ec2 test runner reports a failure fail the job step and thus the workflow --------- Co-authored-by: Alexander Bayandin --- .github/workflows/periodic_pagebench.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index a8baf6bf7a..ed4e6be712 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -94,10 +94,12 @@ jobs: set +x status=$(echo $response | jq -r '.status') echo "Test status: $status" - if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then + if [[ "$status" == "failure" ]]; then + echo "Test failed" + exit 1 # Fail the job step if status is failure + elif [[ "$status" == "success" || "$status" == "null" ]]; then break - fi - if [[ "$status" == "too_many_runs" ]]; then + elif [[ "$status" == "too_many_runs" ]]; then echo "Too many runs already running" echo "too_many_runs=true" >> "$GITHUB_OUTPUT" exit 1 @@ -107,6 +109,7 @@ jobs: done - name: Retrieve Test Logs + if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | curl -k -X 'GET' \ "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ @@ -115,6 +118,7 @@ jobs: --output "test_log_${GITHUB_RUN_ID}.gz" - name: Unzip Test Log and Print it into this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | gzip -d "test_log_${GITHUB_RUN_ID}.gz" cat "test_log_${GITHUB_RUN_ID}" From 64334f497dbc5b6fd0ee8855c3c69a3220068790 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Jul 2024 10:34:16 +0100 Subject: [PATCH 182/412] tests: make location_conf_churn more robust (#8271) ## Problem This test directly manages locations on pageservers and configuration of an endpoint. However, it did not switch off the parts of the storage controller that attempt to do the same: occasionally, the test would fail in a strange way such as a compute failing to accept a reconfiguration request. ## Summary of changes - Wire up the storage controller's compute notification hook to a no-op handler - Configure the tenant's scheduling policy to Stop. 
--- .../regress/test_pageserver_secondary.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 4c828b86b0..0416078ebc 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -16,6 +16,8 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -59,7 +61,7 @@ def evict_random_layers( @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. We should @@ -73,6 +75,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -99,6 +115,15 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. rng = random.Random(seed) From b44ee3950ae59faf73299c9e5b79039b56af4a1d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Jul 2024 11:17:44 +0100 Subject: [PATCH 183/412] safekeeper: add separate `tombstones` map for deleted timelines (#8253) ## Problem Safekeepers left running for a long time use a lot of memory (up to the point of OOMing, on small nodes) for deleted timelines, because the `Timeline` struct is kept alive as a guard against recreating deleted timelines. Closes: https://github.com/neondatabase/neon/issues/6810 ## Summary of changes - Create separate tombstones that just record a ttid and when the timeline was deleted. 
- Add a periodic housekeeping task that cleans up tombstones older than a hardcoded TTL (24h) I think this also makes https://github.com/neondatabase/neon/pull/6766 un-needed, as the tombstone is also checked during deletion. I considered making the overall timeline map use an enum type containing active or deleted, but having a separate map of tombstones avoids bloating that map, so that calls like `get()` can still go straight to a timeline without having to walk a hashmap that also contains tombstones. --- safekeeper/src/bin/safekeeper.rs | 13 +++ safekeeper/src/timelines_global_map.rs | 105 +++++++++++++++++-------- 2 files changed, 87 insertions(+), 31 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d25b8722ac..4d580e57ed 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -445,6 +445,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("WAL service main".to_owned(), res)); tasks_handles.push(Box::pin(wal_service_handle)); + let timeline_housekeeping_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(async move { + const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24); + loop { + tokio::time::sleep(TOMBSTONE_TTL).await; + GlobalTimelines::housekeeping(&TOMBSTONE_TTL); + } + }) + .map(|res| ("Timeline map housekeeping".to_owned(), res)); + tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let conf_ = conf.clone(); let wal_service_handle = current_thread_rt diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 9ce1112cec..f57da5c7cb 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -15,12 +15,19 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, + + // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent + // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as + // this map is dropped on restart. + tombstones: HashMap, + conf: Option, broker_active_set: Arc, load_lock: Arc>, @@ -64,11 +71,17 @@ impl GlobalTimelinesState { .cloned() .ok_or(TimelineError::NotFound(*ttid)) } + + fn delete(&mut self, ttid: TenantTimelineId) { + self.timelines.remove(&ttid); + self.tombstones.insert(ttid, Instant::now()); + } } static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), + tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), @@ -198,11 +211,17 @@ impl GlobalTimelines { let tli = Arc::new(timeline); // TODO: prevent concurrent timeline creation/loading - TIMELINES_STATE - .lock() - .unwrap() - .timelines - .insert(ttid, tli.clone()); + { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. 
+ if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); @@ -229,7 +248,7 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, @@ -241,6 +260,11 @@ impl GlobalTimelines { // Timeline already exists, return it. return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -300,17 +324,19 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -339,12 +365,26 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. + info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); @@ -354,11 +394,6 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); - Ok(TimelineDeleteForceResult { dir_existed, was_active, // TODO: we probably should remove this field @@ -374,7 +409,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. 
Returns map of all timelines which @@ -420,19 +462,20 @@ impl GlobalTimelines { tenant_id, ))?; - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } - Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. + let now = Instant::now(); + state + .tombstones + .retain(|_, v| now.duration_since(*v) < *tombstone_ttl); + } } #[derive(Clone, Copy, Serialize)] From c8dd78c6c8865972401251a7a32f9bc8bfe6dc09 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 5 Jul 2024 14:02:02 +0100 Subject: [PATCH 184/412] pageserver: add time based image layer creation check (#8247) ## Problem Assume a timeline with the following workload: very slow ingest of updates to a small number of keys that fit within the same partition (as decided by `KeySpace::partition`). These tenants will create small L0 layers since due to time based rolling, and, consequently, the L1 layers will also be small. Currently, by default, we need to ingest 512 MiB of WAL before checking if an image layer is required. This scheme works fine under the assumption that L1s are roughly of checkpoint distance size, but as the first paragraph explained, that's not the case for all workloads. ## Summary of changes Check if new image layers are required at least once every checkpoint timeout interval. --- pageserver/src/tenant/timeline.rs | 71 ++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 42e55ab269..92baf1073a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -365,6 +365,7 @@ pub struct Timeline { repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, + last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -2384,6 +2385,7 @@ impl Timeline { )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(RelSizeCache { @@ -4464,6 +4466,58 @@ impl Timeline { } } + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. If enough time has passed since the last check + /// 2.1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2.2. 
For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4486,22 +4540,7 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = { - let last_checks_at = self.last_image_layer_creation_check_at.load(); - let distance = lsn - .checked_sub(last_checks_at) - .expect("Attempt to compact with LSN going backwards"); - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting if this timeline has not ingested sufficient - // WAL since the last check. 
- distance.0 >= min_distance - }; - - if check_for_image_layers { - self.last_image_layer_creation_check_at.store(lsn); - } + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; From d1495755e7e49705dffcd204fdaa9315cf1bf5e3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 5 Jul 2024 15:12:01 +0100 Subject: [PATCH 185/412] =?UTF-8?q?SELECT=20=F0=9F=92=A3();=20(#8270)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We want to be able to test how our infrastructure reacts on segfaults in Postgres (for example, we collect cores, and get some required logs/metrics, etc) ## Summary of changes - Add `trigger_segfauls` function to `neon_test_utils` to trigger a segfault in Postgres - Add `trigger_panic` function to `neon_test_utils` to trigger SIGABRT (by using `elog(PANIC, ...)) - Fix cleanup logic in regression tests in endpoint crashed --- pgxn/neon_test_utils/Makefile | 2 +- ...tils--1.2.sql => neon_test_utils--1.3.sql} | 18 +++++++++++++++ pgxn/neon_test_utils/neon_test_utils.control | 2 +- pgxn/neon_test_utils/neontest.c | 23 +++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 18 +++++++++++---- test_runner/regress/test_endpoint_crash.py | 23 +++++++++++++++++++ 6 files changed, 80 insertions(+), 6 deletions(-) rename pgxn/neon_test_utils/{neon_test_utils--1.2.sql => neon_test_utils--1.3.sql} (77%) create mode 100644 test_runner/regress/test_endpoint_crash.py diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 1371272439..252810b5b0 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.2.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.2.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql similarity index 77% rename from pgxn/neon_test_utils/neon_test_utils--1.2.sql rename to pgxn/neon_test_utils/neon_test_utils--1.3.sql index f84a24ec8d..3b8794a8cf 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.2.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -45,3 +45,21 @@ CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index c7b9191ddc..f22afd70c4 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 071dc122ed..650ef7405d 100644 --- 
a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -42,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -489,3 +491,24 @@ neon_xlogflush(PG_FUNCTION_ARGS) XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. + */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c002e11c1c..5fb4d94817 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -943,6 +943,8 @@ class NeonEnvBuilder: # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None @@ -1214,11 +1216,11 @@ class NeonEnv: for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. """ - self.endpoints.stop_all() + self.endpoints.stop_all(fail_on_endpoint_errors) # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown @@ -3899,9 +3901,17 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py new file mode 100644 index 0000000000..ae3dded437 --- /dev/null +++ b/test_runner/regress/test_endpoint_crash.py @@ -0,0 +1,23 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize( + "sql_func", + [ + "trigger_panic", + "trigger_segfault", + "💣", # calls `trigger_segfault` internally + ], +) +def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): + """ + Test that triggering crash from neon_test_utils crashes the endpoint + """ + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_endpoint_crash") + endpoint = env.endpoints.create_start("test_endpoint_crash") + + endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql(f"SELECT {sql_func}();") From 05b41696447b920ba77067930e0cdfb7b6ad00e1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 5 Jul 2024 20:39:10 +0300 Subject: [PATCH 186/412] Increase timeout for wating subscriber caught-up (#8118) ## Problem test_subscriber_restart has quit large failure rate' 
https://neonprod.grafana.net/d/fddp4rvg7k2dcf/regression-test-failures?orgId=1&var-test_name=test_subscriber_restart&var-max_count=100&var-restrict=false I can be caused by too small timeout (5 seconds) to wait until changes are propagated. Related to #8097 ## Summary of changes Increase timeout to 30 seconds. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_subscriber_restart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d7f3962620..91caad7220 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -54,4 +54,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(60, 0.5, check_that_changes_propagated) From f3310143e44b536d883d03dbebcb17f496318a7b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 5 Jul 2024 22:17:05 +0200 Subject: [PATCH 187/412] pageserver_live_connections: track as counter pair (#8227) Generally counter pairs are preferred over gauges. In this case, I found myself asking what the typical rate of accepted page_service connections on a pageserver is, and I couldn't answer it with the gauge metric. There are a few dashboards using this metric: https://github.com/search?q=repo%3Aneondatabase%2Fgrafana-dashboard-export%20pageserver_live_connections&type=code I'll convert them to use the new metric once this PR reaches prod. 
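As a hedged illustration of why a counter pair subsumes the gauge (a sketch only, not the pageserver's metrics code, which uses the prometheus counter-pair helper shown in the diff below): a RAII guard increments a "started" counter when handling begins and a "finished" counter when it ends, so the former gauge value is always recoverable as started - finished, while the accept rate falls out of the "started" counter alone, e.g. something like `rate(pageserver_live_connections_started[5m])` on the new metrics.

```
use std::sync::atomic::{AtomicU64, Ordering};

static STARTED: AtomicU64 = AtomicU64::new(0);
static FINISHED: AtomicU64 = AtomicU64::new(0);

struct ConnectionGuard;

impl ConnectionGuard {
    fn new() -> Self {
        STARTED.fetch_add(1, Ordering::Relaxed); // connection accepted
        ConnectionGuard
    }
}

impl Drop for ConnectionGuard {
    fn drop(&mut self) {
        // Runs on every exit path, including early returns and unwinding panics.
        FINISHED.fetch_add(1, Ordering::Relaxed);
    }
}

fn handle_connection() {
    let _guard = ConnectionGuard::new();
    // ... serve the connection; in-flight connections = STARTED - FINISHED
}

fn main() {
    handle_connection();
    println!(
        "started={} finished={} in_flight={}",
        STARTED.load(Ordering::Relaxed),
        FINISHED.load(Ordering::Relaxed),
        STARTED.load(Ordering::Relaxed) - FINISHED.load(Ordering::Relaxed)
    );
}
```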
refs https://github.com/neondatabase/neon/issues/7427 --- pageserver/src/metrics.rs | 10 ++++++---- pageserver/src/page_service.rs | 13 ++++--------- .../timeline/walreceiver/walreceiver_connection.rs | 13 ++++--------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9e9fe7fbb8..59b7293631 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1456,10 +1456,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a440ad6378..07365b5eb8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -215,14 +215,9 @@ async fn page_service_conn_main( auth_type: AuthType, connection_ctx: RequestContext, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index c6ee6b90c4..a66900522a 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. 
- let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); From 3ef7748e6ba0d0552a33499b35fe7ac4b19a0835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 5 Jul 2024 22:18:05 +0200 Subject: [PATCH 188/412] Improve parsing of `ImageCompressionAlgorithm` (#8281) Improve parsing of the `ImageCompressionAlgorithm` enum to allow level customization like `zstd(1)`, as strum only takes `Default::default()`, i.e. `None` as the level. Part of #5431 --- libs/pageserver_api/src/models.rs | 64 +++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ecc543917e..49c942938d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -437,18 +438,7 @@ pub enum CompactionAlgorithm { Tiered, } -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Serialize, - Deserialize, - strum_macros::FromRepr, - strum_macros::EnumString, -)] -#[strum(serialize_all = "kebab-case")] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum ImageCompressionAlgorithm { /// Disabled for writes, and never decompress during reading. /// Never set this after you've enabled compression once! @@ -468,6 +458,31 @@ impl ImageCompressionAlgorithm { } } +impl FromStr for ImageCompressionAlgorithm { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + let mut components = s.split(['(', ')']); + let first = components + .next() + .ok_or_else(|| anyhow::anyhow!("empty string"))?; + match first { + "disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress), + "disabled" => Ok(ImageCompressionAlgorithm::Disabled), + "zstd" => { + let level = if let Some(v) = components.next() { + let v: i8 = v.parse()?; + Some(v) + } else { + None + }; + + Ok(ImageCompressionAlgorithm::Zstd { level }) + } + _ => anyhow::bail!("invalid specifier '{first}'"), + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, @@ -1660,4 +1675,29 @@ mod tests { AuxFilePolicy::CrossValidation ); } + + #[test] + fn test_image_compression_algorithm_parsing() { + use ImageCompressionAlgorithm::*; + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled").unwrap(), + Disabled + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(), + DisabledNoDecompress + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd").unwrap(), + Zstd { level: None } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), + Zstd { level: Some(18) } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), + Zstd { level: Some(-3) } + ); + } } From 36b790f2825a4021b404d63a76a504d333dda6d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 5 Jul 2024 22:36:28 +0200 Subject: [PATCH 189/412] Add concurrency to the find-large-objects scrubber subcommand (#8291) The find-large-objects scrubber subcommand is quite fast if you run it in an environment with low latency 
to the S3 bucket (say an EC2 instance in the same region). However, the higher the latency gets, the slower the command becomes. Therefore, add a concurrency param and make it parallelized. This doesn't change that general relationship, but at least lets us do multiple requests in parallel and therefore hopefully faster. Running with concurrency of 64 (default): ``` 2024-07-05T17:30:22.882959Z INFO lazy_load_identity [...] [...] 2024-07-05T17:30:28.289853Z INFO Scanned 500 shards. [...] ``` With concurrency of 1, simulating state before this PR: ``` 2024-07-05T17:31:43.375153Z INFO lazy_load_identity [...] [...] 2024-07-05T17:33:51.987092Z INFO Scanned 500 shards. [...] ``` In other words, to list 500 shards, speed is increased from 2:08 minutes to 6 seconds. Follow-up of #8257, part of #5431 --- storage_scrubber/src/find_large_objects.rs | 101 +++++++++++++-------- storage_scrubber/src/main.rs | 13 ++- 2 files changed, 72 insertions(+), 42 deletions(-) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 24668b6516..1422545f2f 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -1,4 +1,4 @@ -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::storage_layer::LayerName; use serde::{Deserialize, Serialize}; @@ -29,7 +29,7 @@ impl LargeObjectKind { } } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct LargeObject { pub key: String, pub size: u64, @@ -45,53 +45,76 @@ pub async fn find_large_objects( bucket_config: BucketConfig, min_size: u64, ignore_deltas: bool, + concurrency: usize, ) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; - let mut tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + + let objects_stream = tenants.map_ok(|tenant_shard_id| { + let mut tenant_root = target.tenant_root(&tenant_shard_id); + let s3_client = s3_client.clone(); + async move { + let mut objects = Vec::new(); + let mut total_objects_ctr = 0u64; + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + }) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + total_objects_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok((tenant_shard_id, objects, total_objects_ctr)) + } + }); + let mut objects_stream = std::pin::pin!(objects_stream.try_buffer_unordered(concurrency)); + let mut objects = Vec::new(); + let mut tenant_ctr = 0u64; let mut object_ctr = 0u64; - while let Some(tenant_shard_id) = tenants.next().await { - let tenant_shard_id = tenant_shard_id?; - let mut tenant_root = target.tenant_root(&tenant_shard_id); - // We want the objects and not just common prefixes - tenant_root.delimiter.clear(); - let 
mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) - .await?; - for obj in fetch_response.contents().iter().filter(|o| { - if let Some(obj_size) = o.size { - min_size as i64 <= obj_size - } else { - false - } - }) { - let key = obj.key().expect("couldn't get key").to_owned(); - let kind = LargeObjectKind::from_key(&key); - if ignore_deltas && kind == LargeObjectKind::DeltaLayer { - continue; - } - objects.push(LargeObject { - key, - size: obj.size.unwrap() as u64, - kind, - }) - } - object_ctr += fetch_response.contents().len() as u64; - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } + while let Some(res) = objects_stream.next().await { + let (tenant_shard_id, objects_slice, total_objects_ctr) = res?; + objects.extend_from_slice(&objects_slice); + object_ctr += total_objects_ctr; tenant_ctr += 1; - if tenant_ctr % 50 == 0 { + if tenant_ctr % 100 == 0 { tracing::info!( - "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", objects.len() + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", + objects.len() ); } } + + let bucket_name = target.bucket_name(); + tracing::info!( + "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. objects={object_ctr}, found={}.", + objects.len() + ); Ok(LargeObjectListing { objects }) } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 10699edd3c..16a26613d2 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -78,6 +78,8 @@ enum Command { min_size: u64, #[arg(short, long, default_value_t = false)] ignore_deltas: bool, + #[arg(long = "concurrency", short = 'j', default_value_t = 64)] + concurrency: usize, }, } @@ -210,10 +212,15 @@ async fn main() -> anyhow::Result<()> { Command::FindLargeObjects { min_size, ignore_deltas, + concurrency, } => { - let summary = - find_large_objects::find_large_objects(bucket_config, min_size, ignore_deltas) - .await?; + let summary = find_large_objects::find_large_objects( + bucket_config, + min_size, + ignore_deltas, + concurrency, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } From a58827f952e47b7a15c7ae8ad0df44a2c24753ad Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 Jul 2024 17:41:54 +0100 Subject: [PATCH 190/412] build(deps): bump certifi from 2023.7.22 to 2024.7.4 (#8301) --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7740388fb8..bf16aaf55d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2023.7.22" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] From 285c6d2974dfade5f0385b1233bc2669e7f5eb8c Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 8 Jul 2024 09:05:49 -0400 Subject: [PATCH 191/412] fix(pageserver): ensure sparse keyspace is ordered (#8285) ## Problem Sparse keyspaces were constructed with ranges out of order: this didn't break things obviously, but meant that users of KeySpace functions that assume ordering would assert out. Closes https://github.com/neondatabase/neon/issues/8277 ## Summary of changes make sure the sparse keyspace has ordered keyspace parts --- pageserver/src/pgdatadir_mapping.rs | 52 +++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 25d00d6dfd..fefd8d88ff 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -919,6 +919,9 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + // Add extra keyspaces in the test cases. Some test cases write keys into the storage without + // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` + // and the keys will not be garbage-colllected. #[cfg(test)] { let guard = self.extra_test_dense_keyspace.load(); @@ -927,13 +930,48 @@ impl Timeline { } } - Ok(( - result.to_keyspace(), - /* AUX sparse key space */ - SparseKeySpace(KeySpace { - ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], - }), - )) + let dense_keyspace = result.to_keyspace(); + let sparse_keyspace = SparseKeySpace(KeySpace { + ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + }); + + if cfg!(debug_assertions) { + // Verify if the sparse keyspaces are ordered and non-overlapping. + + // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each + // category of sparse keys are split into their own image/delta files. If there + // are overlapping keyspaces, they will be automatically merged by keyspace accum, + // and we want the developer to keep the keyspaces separated. 
+ + let ranges = &sparse_keyspace.0.ranges; + + // TODO: use a single overlaps_with across the codebase + fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + for i in 0..ranges.len() { + for j in 0..i { + if overlaps_with(&ranges[i], &ranges[j]) { + panic!( + "overlapping sparse keyspace: {}..{} and {}..{}", + ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end + ); + } + } + } + for i in 1..ranges.len() { + assert!( + ranges[i - 1].end <= ranges[i].start, + "unordered sparse keyspace: {}..{} and {}..{}", + ranges[i - 1].start, + ranges[i - 1].end, + ranges[i].start, + ranges[i].end + ); + } + } + + Ok((dense_keyspace, sparse_keyspace)) } /// Get cached size of relation if it not updated after specified LSN From 0948fb6bf1c0ed5b38a80ea7eededb2cfcb46c98 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 14:10:42 +0100 Subject: [PATCH 192/412] pageserver: switch to jemalloc (#8307) ## Problem - Resident memory on long running pageserver processes tends to climb: memory fragmentation is suspected. - Total resident memory may be a limiting factor for running on smaller nodes. ## Summary of changes - As a low-energy experiment, switch the pageserver to use jemalloc (not a net-new dependency, proxy already use it) - Decide at end of week whether to revert before next release. --- Cargo.lock | 2 ++ pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 3 +++ workspace_hack/Cargo.toml | 1 + 4 files changed, 7 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 6dae8e3403..716b6690d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3667,6 +3667,7 @@ dependencies = [ "sysinfo", "tenant_size_model", "thiserror", + "tikv-jemallocator", "tokio", "tokio-epoll-uring", "tokio-io-timeout", @@ -7468,6 +7469,7 @@ dependencies = [ "syn 1.0.109", "syn 2.0.52", "sync_wrapper", + "tikv-jemalloc-sys", "time", "time-macros", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4335f38f1e..0d9343d643 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -62,6 +62,7 @@ sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 39d4e46c96..2763352a21 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -47,6 +47,9 @@ use utils::{ project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f43076171f..e1b1806bc8 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -69,6 +69,7 @@ sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls 
= { version = "0.24" } From 55d37c77b94f0ef51438f52d0068a40fbe8969f6 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 14:57:17 -0500 Subject: [PATCH 193/412] Hide import behind TYPE_CHECKING No need to import it if we aren't type checking anything. --- test_runner/performance/test_logical_replication.py | 7 ++++++- test_runner/regress/test_physical_replication.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 7d11facc29..570bd11b6f 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,8 +1,13 @@ +from __future__ import annotations + import time import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, PgBin @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index a1bff32eed..043aff686b 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -1,7 +1,11 @@ +from __future__ import annotations + import random import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): From 8d9b632f2ad83463fad448d1aafe0633a383a0c6 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 14:54:49 -0500 Subject: [PATCH 194/412] Add Neon HTTP API test fixture This is a Python binding to the Neon HTTP API. It isn't complete, but can be extended as necessary. 
Co-authored-by: Sasha Krassovsky --- test_runner/fixtures/neon_api.py | 263 ++++++++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 21 ++ 2 files changed, 284 insertions(+) create mode 100644 test_runner/fixtures/neon_api.py diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py new file mode 100644 index 0000000000..39baf5fab6 --- /dev/null +++ b/test_runner/fixtures/neon_api.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import time +from typing import TYPE_CHECKING, cast + +import requests + +if TYPE_CHECKING: + from typing import Any, Dict, Literal, Optional, Union + + from fixtures.pg_version import PgVersion + + +def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]: + return { + "PGHOST": params["host"], + "PGDATABASE": params["database"], + "PGUSER": params["role"], + "PGPASSWORD": params["password"], + } + + +class NeonAPI: + def __init__(self, neon_api_key: str, neon_api_base_url: str): + self.__neon_api_key = neon_api_key + self.__neon_api_base_url = neon_api_base_url.strip("/") + + def __request( + self, method: Union[str, bytes], endpoint: str, **kwargs: Any + ) -> requests.Response: + if "headers" not in kwargs: + kwargs["headers"] = {} + kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" + + return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + + def create_project( + self, + pg_version: Optional[PgVersion] = None, + name: Optional[str] = None, + branch_name: Optional[str] = None, + branch_role_name: Optional[str] = None, + branch_database_name: Optional[str] = None, + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "project": { + "branch": {}, + }, + } + if name: + data["project"]["name"] = name + if pg_version: + data["project"]["pg_version"] = int(pg_version) + if branch_name: + data["project"]["branch"]["name"] = branch_name + if branch_role_name: + data["project"]["branch"]["role_name"] = branch_role_name + if branch_database_name: + data["project"]["branch"]["database_name"] = branch_database_name + + resp = self.__request( + "POST", + "/projects", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_project_details(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + assert resp.status_code == 200 + return cast("Dict[str, Any]", resp.json()) + + def delete_project( + self, + project_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "DELETE", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def start_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/start", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def suspend_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/suspend", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return 
cast("Dict[str, Any]", resp.json()) + + def restart_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/restart", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def create_endpoint( + self, + project_id: str, + branch_id: str, + endpoint_type: Literal["read_write", "read_only"], + settings: Dict[str, Any], + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "endpoint": { + "branch_id": branch_id, + }, + } + + if endpoint_type: + data["endpoint"]["type"] = endpoint_type + if settings: + data["endpoint"]["settings"] = settings + + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_connection_uri( + self, + project_id: str, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, + database_name: str = "neondb", + role_name: str = "neondb_owner", + pooled: bool = True, + ) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/connection_uri", + params={ + "branch_id": branch_id, + "endpoint_id": endpoint_id, + "database_name": database_name, + "role_name": role_name, + "pooled": pooled, + }, + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_branches(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/branches", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_endpoints(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_operations(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/operations", + headers={ + "Accept": "application/json", + "Authorization": f"Bearer {self.__neon_api_key}", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def wait_for_operation_to_finish(self, project_id: str): + has_running = True + while has_running: + has_running = False + operations = self.get_operations(project_id)["operations"] + for op in operations: + if op["status"] in {"scheduling", "running", "cancelling"}: + has_running = True + time.sleep(0.5) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5fb4d94817..ac2fcd8ade 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -87,6 +87,8 @@ from fixtures.utils import ( ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport +from .neon_api import NeonAPI + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. 
@@ -184,6 +186,25 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite yield versioned_dir +@pytest.fixture(scope="session") +def neon_api_key() -> str: + api_key = os.getenv("NEON_API_KEY") + if not api_key: + raise AssertionError("Set the NEON_API_KEY environment variable") + + return api_key + + +@pytest.fixture(scope="session") +def neon_api_base_url() -> str: + return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2") + + +@pytest.fixture(scope="session") +def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: + return NeonAPI(neon_api_key, neon_api_base_url) + + def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. From 8328580dc2d8d7dc395f0c0de19e0136a0a67ab3 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 14:59:19 -0500 Subject: [PATCH 195/412] Log PG environment variables when a PgBin runs Useful for debugging situations like connecting to databases. Co-authored-by: Sasha Krassovsky --- test_runner/fixtures/neon_fixtures.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ac2fcd8ade..532e7bcce5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2883,6 +2883,13 @@ class PgBin: env.update(env_add) return env + def _log_env(self, env: dict[str, str]) -> None: + env_s = {} + for k, v in env.items(): + if k.startswith("PG") and k != "PGPASSWORD": + env_s[k] = v + log.debug(f"Environment: {env_s}") + def run( self, command: List[str], @@ -2905,6 +2912,7 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) def run_capture( @@ -2925,6 +2933,7 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) base_path, _, _ = subprocess_capture( self.log_dir, command, From 930201e03389407a06fa2b5431078c401a6872ae Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 15:04:57 -0500 Subject: [PATCH 196/412] Add PgBin.run_nonblocking() Allows a process to run without blocking program execution, which can be useful for certain test scenarios. Co-authored-by: Sasha Krassovsky --- test_runner/fixtures/neon_fixtures.py | 32 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 532e7bcce5..cae2e422c1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2890,14 +2890,14 @@ class PgBin: env_s[k] = v log.debug(f"Environment: {env_s}") - def run( + def run_nonblocking( self, command: List[str], env: Optional[Env] = None, cwd: Optional[Union[str, Path]] = None, - ): + ) -> subprocess.Popen[Any]: """ - Run one of the postgres binaries. + Run one of the postgres binaries, not waiting for it to finish The command should be in list form, e.g. ['pgbench', '-p', '55432'] @@ -2908,12 +2908,34 @@ class PgBin: If you want stdout/stderr captured to files, use `run_capture` instead. 
""" - self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) self._log_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) + return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> None: + """ + Run one of the postgres binaries, waiting for it to finish + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + proc = self.run_nonblocking(command, env, cwd) + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, proc.args) def run_capture( self, From cec216c5c0f35e3eb7016341aff8d8961e47e691 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 15:22:42 -0500 Subject: [PATCH 197/412] Add long running replication tests These tests will help verify that replication, both physical and logical, works as expected in Neon. Co-authored-by: Sasha Krassovsky --- .../actions/run-python-test-set/action.yml | 1 + .github/workflows/benchmarking.yml | 72 ++++- .../performance/test_logical_replication.py | 295 ++++++++++++++++- .../performance/test_physical_replication.py | 296 ++++++++++++++++++ 4 files changed, 662 insertions(+), 2 deletions(-) create mode 100644 test_runner/performance/test_physical_replication.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 7f843de1a5..daaedf6d11 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -115,6 +115,7 @@ runs: export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib + export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index db04b5de7d..899cae2b86 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -99,7 +99,14 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py + extra_params: + -m remote_cluster + --sparse-ordering + --timeout 5400 + --ignore test_runner/performance/test_perf_olap.py + --ignore test_runner/performance/test_perf_pgvector_queries.py + --ignore test_runner/performance/test_logical_replication.py + --ignore test_runner/performance/test_physical_replication.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -125,6 +132,69 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + replication-tests: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_logical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_physical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 570bd11b6f..5ab83dd31d 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,13 +1,24 @@ from __future__ import annotations import time +import traceback +from typing import TYPE_CHECKING +import psycopg2 
+import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.pg_version import PgVersion if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI from fixtures.neon_fixtures import NeonEnv, PgBin + from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -31,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -47,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. 
+ """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + sub_endpoint_id = sub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + neon_api.restart_endpoint( + sub_project_id, + sub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except 
Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + pub_endpoint_id = pub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + neon_api.restart_endpoint( + pub_project_id, + pub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(pub_project_id) + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = 
neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 0000000000..7e16197211 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + 
endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( + replica_connstr + ) as conn_replica: + with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + lag = measure_replication_lag(cur_master, cur_replica) + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + finally: + replica_workload.terminate() + finally: + master_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) + + +def report_pgbench_aggregate_intervals( + output_dir: Path, + prefix: str, + zenbenchmark: NeonBenchmarker, +): + for filename in os.listdir(output_dir): + if filename.startswith(prefix): + # The file will be in the form _. + # So we first lop off the ., and then lop off the prefix and the _ + node = filename.split(".")[0][len(prefix) + 1 :] + with open(output_dir / filename) as f: + reader = csv.reader(f, delimiter=" ") + for line in reader: + num_transactions = int(line[1]) + if num_transactions == 0: + continue + sum_latency = int(line[2]) + sum_lag = int(line[3]) + zenbenchmark.record( + f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER + ) + zenbenchmark.record( + f"{node}_avg_latency", + sum_latency / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{node}_avg_lag", + sum_lag / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_replication_start_stop( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Cycles through different configurations of read replicas being enabled disabled. The whole time, + there's a pgbench read/write workload going on the master. For each replica, we either turn it + on or off, and see how long it takes to catch up after some set amount of time of replicating + the pgbench. 
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. 
+ for iconfig in range((1 << num_replicas) - 1, -1, -1): + + def replica_enabled(iconfig: int = iconfig): + return bool((iconfig >> 1) & 1) + + # Change configuration + for ireplica in range(num_replicas): + if replica_enabled() and replica_pgbench[ireplica] is None: + replica_pgbench[ireplica] = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + "-S", + pgbench_duration, + "--log", + f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=replica_env[ireplica], + ) + elif not replica_enabled() and replica_pgbench[ireplica] is not None: + pgb = replica_pgbench[ireplica] + assert pgb is not None + pgb.terminate() + pgb.wait() + replica_pgbench[ireplica] = None + + neon_api.suspend_endpoint( + project_id, + replicas[ireplica]["endpoint"]["id"], + ) + neon_api.wait_for_operation_to_finish(project_id) + + time.sleep(configuration_test_time_sec) + + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + master_pgbench.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(project_id) + # Only report results if we didn't error out + report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark) From ab06240faeeca1746449ebda255bb2eeeec853a2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 15:39:41 +0100 Subject: [PATCH 198/412] pageserver: respect has_relmap_file in collect_keyspace (#8276) ## Problem Rarely, a dbdir entry can exist with no `relmap_file_key` data. This causes compaction to fail, because it assumes that if the database exists, then so does the relmap file. Basebackup already handled this using a boolean to record whether such a key exists, but `collect_keyspace` didn't. 
## Summary of changes - Respect the flag for whether a relfilemap exists in collect_keyspace - The reproducer for this issue will merge separately in https://github.com/neondatabase/neon/pull/8232 --- pageserver/src/pgdatadir_mapping.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fefd8d88ff..8a6cfea92b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -854,13 +854,14 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf)?; + let dbdir = self.list_dbdirs(lsn, ctx).await?; + let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect(); - let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); - dbs.sort_unstable(); - for (spcnode, dbnode) in dbs { - result.add_key(relmap_file_key(spcnode, dbnode)); + dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b)); + for ((spcnode, dbnode), has_relmap_file) in dbs { + if has_relmap_file { + result.add_key(relmap_file_key(spcnode, dbnode)); + } result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self From ebf3bfadde52a22317b3724af7bb0dd02bc03fda Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 8 Jul 2024 10:43:10 -0400 Subject: [PATCH 199/412] refactor: move part of sharding API from `pageserver_api` to `utils` (#8254) ## Problem LSN Leases introduced in #8084 is a new API that is made shard-aware from day 1. To support ephemeral endpoint in #7994 without linking Postgres C API against `compute_ctl`, part of the sharding needs to reside in `utils`. ## Summary of changes - Create a new `shard` module in utils crate. - Move more interface related part of tenant sharding API to utils and re-export them in pageserver_api. Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/shard.rs | 516 +++---------------------------- libs/utils/src/lib.rs | 2 + libs/utils/src/shard.rs | 451 +++++++++++++++++++++++++++ 3 files changed, 490 insertions(+), 479 deletions(-) create mode 100644 libs/utils/src/shard.rs diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8c5a4e6168..e83cf4c855 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,59 +1,42 @@ -use std::{ops::RangeInclusive, str::FromStr}; +//! See docs/rfcs/031-sharding-static.md for an overview of sharding. +//! +//! This module contains a variety of types used to represent the concept of sharding +//! a Neon tenant across multiple physical shards. Since there are quite a few of these, +//! we provide an summary here. +//! +//! Types used to describe shards: +//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +//! which identifies a tenant which is not shard-aware. This means its storage paths do not include +//! a shard suffix. +//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +//! without the tenant ID. This is useful for things that are implicitly scoped to a particular +//! tenant, such as layer files. +//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +//! 
detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +//! four hex digits. An unsharded tenant is `0000`. +//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +//! +//! Types used to describe the parameters for data distribution in a sharded tenant: +//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +//! multiple shards. Its value is given in 8kiB pages. +//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +//! always zero: this is provided for future upgrades that might introduce different +//! data distribution schemes. +//! +//! Examples: +//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +//! and their slugs are 0004, 0104, 0204, and 0304. use crate::{key::Key, models::ShardParameters}; -use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use utils::id::TenantId; -/// See docs/rfcs/031-sharding-static.md for an overview of sharding. -/// -/// This module contains a variety of types used to represent the concept of sharding -/// a Neon tenant across multiple physical shards. Since there are quite a few of these, -/// we provide an summary here. -/// -/// Types used to describe shards: -/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value -/// which identifies a tenant which is not shard-aware. This means its storage paths do not include -/// a shard suffix. -/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. -/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` -/// without the tenant ID. This is useful for things that are implicitly scoped to a particular -/// tenant, such as layer files. -/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient -/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. -/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as -/// four hex digits. An unsharded tenant is `0000`. -/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant -/// -/// Types used to describe the parameters for data distribution in a sharded tenant: -/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across -/// multiple shards. Its value is given in 8kiB pages. -/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is -/// always zero: this is provided for future upgrades that might introduce different -/// data distribution schemes. -/// -/// Examples: -/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 -/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 -/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), -/// and their slugs are 0004, 0104, 0204, and 0304. 
- -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardNumber(pub u8); - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(u8); - -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} +#[doc(inline)] +pub use ::utils::shard::*; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -65,362 +48,6 @@ pub struct ShardIdentity { layout: ShardLayout, } -/// Formatting helper, for generating the `shard_id` label in traces. -struct ShardSlug<'a>(&'a TenantShardId); - -/// TenantShardId globally identifies a particular shard in a particular tenant. -/// -/// These are written as `-`, for example: -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without -/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables -/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. -/// -/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, -/// is both forward and backward compatible with TenantId: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); - - /// The internal value of a ShardCount may be zero, which means "1 shard, but use - /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as [`TenantShardId::unsharded`]. - /// - /// This method returns the actual number of shards, i.e. if our internal value is - /// zero, we return 1 (unsharded tenants have 1 shard). - pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } - } - - /// The literal internal value: this is **not** the number of shards in the - /// tenant, as we have a special zero value for legacy unsharded tenants. Use - /// [`Self::count`] if you want to know the cardinality of shards. - pub fn literal(&self) -> u8 { - self.0 - } - - /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but - /// uses the legacy format for `TenantShardId`. See also the documentation for - /// [`Self::count`]. - pub fn is_unsharded(&self) -> bool { - self.0 == 0 - } - - /// `v` may be zero, or the number of shards in the tenant. `v` is what - /// [`Self::literal`] would return. - pub const fn new(val: u8) -> Self { - Self(val) - } -} - -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); -} - -impl TenantShardId { - pub fn unsharded(tenant_id: TenantId) -> Self { - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The range of all TenantShardId that belong to a particular TenantId. 
This is useful when - /// you have a BTreeMap of TenantShardId, and are querying by TenantId. - pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { - RangeInclusive::new( - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }, - Self { - tenant_id, - shard_number: ShardNumber::MAX, - shard_count: ShardCount::MAX, - }, - ) - } - - pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { - ShardSlug(self) - } - - /// Convenience for code that has special behavior on the 0th shard. - pub fn is_shard_zero(&self) -> bool { - self.shard_number == ShardNumber(0) - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() - } - - /// Convenience for dropping the tenant_id and just getting the ShardIndex: this - /// is useful when logging from code that is already in a span that includes tenant ID, to - /// keep messages reasonably terse. - pub fn to_index(&self) -> ShardIndex { - ShardIndex { - shard_number: self.shard_number, - shard_count: self.shard_count, - } - } - - /// Calculate the children of this TenantShardId when splitting the overall tenant into - /// the given number of shards. - pub fn split(&self, new_shard_count: ShardCount) -> Vec { - let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); - let mut child_shards = Vec::new(); - for shard_number in 0..ShardNumber(new_shard_count.0).0 { - // Key mapping is based on a round robin mapping of key hash modulo shard count, - // so our child shards are the ones which the same keys would map to. - if shard_number % effective_old_shard_count == self.shard_number.0 { - child_shards.push(TenantShardId { - tenant_id: self.tenant_id, - shard_number: ShardNumber(shard_number), - shard_count: new_shard_count, - }) - } - } - - child_shards - } -} - -impl<'a> std::fmt::Display for ShardSlug<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:02x}{:02x}", - self.0.shard_number.0, self.0.shard_count.0 - ) - } -} - -impl std::fmt::Display for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.shard_count != ShardCount(0) { - write!(f, "{}-{}", self.tenant_id, self.shard_slug()) - } else { - // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this - // is distinct from the normal single shard case (shard count == 1). 
- self.tenant_id.fmt(f) - } - } -} - -impl std::fmt::Debug for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for TenantShardId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count - if s.len() == 32 { - // Legacy case: no shard specified - Ok(Self { - tenant_id: TenantId::from_str(s)?, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }) - } else if s.len() == 37 { - let bytes = s.as_bytes(); - let tenant_id = TenantId::from_hex(&bytes[0..32])?; - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; - Ok(Self { - tenant_id, - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 18]> for TenantShardId { - fn from(b: [u8; 18]) -> Self { - let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); - - Self { - tenant_id: TenantId::from(tenant_id_bytes), - shard_number: ShardNumber(b[16]), - shard_count: ShardCount(b[17]), - } - } -} - -impl ShardIndex { - pub fn new(number: ShardNumber, count: ShardCount) -> Self { - Self { - shard_number: number, - shard_count: count, - } - } - pub fn unsharded() -> Self { - Self { - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - - /// For use in constructing remote storage paths: concatenate this with a TenantId - /// to get a fully qualified TenantShardId. - /// - /// Backward compat: this function returns an empty string if Self::is_unsharded, such - /// that the legacy pre-sharding remote key format is preserved. 
- pub fn get_suffix(&self) -> String { - if self.is_unsharded() { - "".to_string() - } else { - format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } - } -} - -impl std::fmt::Display for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } -} - -impl std::fmt::Debug for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for ShardIndex { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 1 byte shard number, 1 byte shard count - if s.len() == 4 { - let bytes = s.as_bytes(); - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(bytes, &mut shard_parts)?; - Ok(Self { - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 2]> for ShardIndex { - fn from(b: [u8; 2]) -> Self { - Self { - shard_number: ShardNumber(b[0]), - shard_count: ShardCount(b[1]), - } - } -} - -impl Serialize for TenantShardId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Note: while human encoding of [`TenantShardId`] is backward and forward - // compatible, this binary encoding is not. - let mut packed: [u8; 18] = [0; 18]; - packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); - packed[16] = self.shard_number.0; - packed[17] = self.shard_count.0; - - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for TenantShardId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = TenantShardId; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 18])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 18] = Deserialize::deserialize(s)?; - Ok(TenantShardId::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - TenantShardId::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 18, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -585,77 +212,6 @@ impl ShardIdentity { } } -impl Serialize for ShardIndex { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Binary encoding is not used in index_part.json, but is included in anticipation of - // switching various structures (e.g. inter-process communication, remote metadata) to more - // compact binary encodings in future. 
- let mut packed: [u8; 2] = [0; 2]; - packed[0] = self.shard_number.0; - packed[1] = self.shard_count.0; - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for ShardIndex { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = ShardIndex; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 2])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 2] = Deserialize::deserialize(s)?; - Ok(ShardIndex::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - ShardIndex::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 2, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). fn key_is_shard0(key: &Key) -> bool { @@ -737,7 +293,9 @@ pub fn describe( #[cfg(test)] mod tests { - use utils::Hex; + use std::str::FromStr; + + use utils::{id::TenantId, Hex}; use super::*; diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2a397d97d2..711e617801 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,8 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +pub mod shard; + mod hex; pub use hex::Hex; diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs new file mode 100644 index 0000000000..4f9ac6bdb4 --- /dev/null +++ b/libs/utils/src/shard.rs @@ -0,0 +1,451 @@ +//! See `pageserver_api::shard` for description on sharding. + +use std::{ops::RangeInclusive, str::FromStr}; + +use hex::FromHex; +use serde::{Deserialize, Serialize}; + +use crate::id::TenantId; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardNumber(pub u8); + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardCount(pub u8); + +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +pub struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. 
+/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + +impl TenantShardId { + pub fn unsharded(tenant_id: TenantId) -> Self { + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The range of all TenantShardId that belong to a particular TenantId. This is useful when + /// you have a BTreeMap of TenantShardId, and are querying by TenantId. + pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { + RangeInclusive::new( + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }, + Self { + tenant_id, + shard_number: ShardNumber::MAX, + shard_count: ShardCount::MAX, + }, + ) + } + + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_shard_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. 
+ pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. + if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } +} + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) + } +} + +impl std::fmt::Display for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.shard_count != ShardCount(0) { + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) + } else { + // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this + // is distinct from the normal single shard case (shard count == 1). + self.tenant_id.fmt(f) + } + } +} + +impl std::fmt::Debug for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for TenantShardId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count + if s.len() == 32 { + // Legacy case: no shard specified + Ok(Self { + tenant_id: TenantId::from_str(s)?, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }) + } else if s.len() == 37 { + let bytes = s.as_bytes(); + let tenant_id = TenantId::from_hex(&bytes[0..32])?; + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; + Ok(Self { + tenant_id, + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 18]> for TenantShardId { + fn from(b: [u8; 18]) -> Self { + let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); + + Self { + tenant_id: TenantId::from(tenant_id_bytes), + shard_number: ShardNumber(b[16]), + shard_count: ShardCount(b[17]), + } + } +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. 
+ /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. + pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + +impl Serialize for TenantShardId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. + let mut packed: [u8; 18] = [0; 18]; + packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); + packed[16] = self.shard_number.0; + packed[17] = self.shard_count.0; + + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for TenantShardId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = TenantShardId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 18])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 18] = Deserialize::deserialize(s)?; + Ok(TenantShardId::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + TenantShardId::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 18, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. 
+ let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} From e83a499ab416f6aca35ecfb5ecfe898e067b9b10 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 8 Jul 2024 19:54:02 +0200 Subject: [PATCH 200/412] compute_ctl: Use 'fast' shutdown for Postgres termination (#8289) ## Problem We currently use 'immediate' mode in the most commonly used shutdown path, when the control plane calls a `compute_ctl` API to terminate Postgres inside compute without waiting for the actual pod / VM termination. Yet, 'immediate' shutdown doesn't create a shutdown checkpoint and ROs have bad times figuring out the list of running xacts during next start. ## Summary of changes Use 'fast' mode, which creates a shutdown checkpoint that is important for ROs to get a list of running xacts faster instead of going through the CLOG. On the control plane side, we poll this `compute_ctl` termination API for 10s, it should be enough as we don't really write any data at checkpoint time. If it times out, we anyway switch to the slow k8s-based termination. See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. The default VM shutdown hook already uses `fast` mode, see [1] [1] https://github.com/neondatabase/neon/blob/c9fd8d76937c2031fd4fea1cdf661d6cf4f00dc3/vm-image-spec.yaml#L30-L31 Related to #6211 --- compute_tools/src/compute.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 41a52ef5b6..1fa2b9f71d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1386,7 +1386,9 @@ pub fn forward_termination_signal() { let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html - kill(pg_pid, Signal::SIGQUIT).ok(); + // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for + // ROs to get a list of running xacts faster instead of going through the CLOG. + // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. 
+ kill(pg_pid, Signal::SIGINT).ok(); } } From 108bf56e44b7cd0409a3e6a1ae121cd0c70377b1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 20:05:35 +0100 Subject: [PATCH 201/412] tests: use smaller layers in test_pg_regress (#8232) ## Problem Debug-mode runs of test_pg_regress are rather slow since https://github.com/neondatabase/neon/pull/8105, and occasionally exceed their 600s timeout. ## Summary of changes - Use 8MiB layer files, avoiding large ephemeral layers On a hetzner AX102, this takes the runtime from 230s to 190s. Which hopefully will be enough to get the runtime on github runners more reliably below its 600s timeout. This has the side benefit of exercising more of the pageserver stack (including compaction) under a workload that exercises a more diverse set of postgres functionality than most of our tests. --- pageserver/src/tenant/timeline.rs | 3 + test_runner/regress/test_pg_regress.py | 182 ++++++++++++++++--------- 2 files changed, 118 insertions(+), 67 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 92baf1073a..541704e8d6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -728,6 +728,9 @@ impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Other(e) => { + CompactionError::Other(e.context("create image layers")) + } _ => CompactionError::Other(e.into()), } } diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 756a2c17c9..54b493ec70 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -8,8 +8,11 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, NeonEnvBuilder, check_restored_datadir_content, + tenant_get_shards, ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage @@ -21,6 +24,97 @@ if TYPE_CHECKING: from pytest import CaptureFixture +TENANT_CONF = { + # Scaled down thresholds so that we are exercising the pageserver beyond just writing + # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files. + "pitr_interval": "60s", + "checkpoint_distance": f"{8 * 1024 * 1024}", + "compaction_target_size": f"{8 * 1024 * 1024}", +} + +# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. +# # There should have been compactions mid-test as well, this final check is in addition those. +# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant): +# pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True) + + +def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint): + """ + After running some opaque tests that create interesting content in a timeline, run + some generic integrity checks that the storage stack is able to reproduce the written + data properly. + """ + + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. 
+ # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=db_name, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + + # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # There should have been compactions mid-test as well, this final check is in addition those. + for shard, pageserver in tenant_get_shards(env, env.initial_tenant): + pageserver.http_client().timeline_checkpoint( + shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True + ) + + # Run the main PostgreSQL regression tests, in src/test/regress. # @pytest.mark.timeout(600) @@ -45,7 +139,10 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") @@ -84,67 +181,7 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - ignored_files: Optional[list[str]] = None - - # Neon handles unlogged relations in a special manner. During a - # basebackup, we ship the init fork as the main fork. This presents a - # problem in that the endpoint's data directory and the basebackup will - # have differences and will fail the eventual file comparison. - # - # Unlogged tables were introduced in version 9.1. 
ALTER TABLE grew - # support for setting the persistence of a table in 9.5. The reason that - # this doesn't affect versions < 15 (but probably would between 9.1 and - # 9.5) is that all the regression tests that deal with unlogged tables - # up until that point dropped the unlogged tables or set them to logged - # at some point during the test. - # - # In version 15, Postgres grew support for unlogged sequences, and with - # that came a few more regression tests. These tests did not all drop - # the unlogged tables/sequences prior to finishing. - # - # But unlogged sequences came with a bug in that, sequences didn't - # inherit the persistence of their "parent" tables if they had one. This - # was fixed and backported to 15, thus exacerbating our problem a bit. - # - # So what we can do is just ignore file differences between the data - # directory and basebackup for unlogged relations. - results = cast( - "list[tuple[str, str]]", - endpoint.safe_psql( - """ - SELECT - relkind, - pg_relation_filepath( - pg_filenode_relation(reltablespace, relfilenode) - ) AS unlogged_relation_paths - FROM pg_class - WHERE relpersistence = 'u' - """, - dbname=DBNAME, - ), - ) - - unlogged_relation_files: list[str] = [] - for r in results: - unlogged_relation_files.append(r[1]) - # This is related to the following Postgres commit: - # - # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b - # Author: Heikki Linnakangas - # Date: 2023-08-23 09:21:31 -0500 - # - # Use the buffer cache when initializing an unlogged index. - # - # This patch was backpatched to 16. Without it, the LSN in the - # page header would be 0/0 in the data directory, which wouldn't - # match the LSN generated during the basebackup, thus creating - # a difference. - if env.pg_version <= PgVersion.V15 and r[0] == "i": - unlogged_relation_files.append(f"{r[1]}_init") - - ignored_files = unlogged_relation_files - - check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. @@ -159,16 +196,20 @@ def test_isolation( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "isolation_regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) - endpoint.safe_psql("CREATE DATABASE isolation_regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -202,6 +243,9 @@ def test_isolation( with capsys.disabled(): pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + # This fails with a mismatch on `pg_multixact/offsets/0000` + # post_checks(env, test_output_dir, DBNAME, endpoint) + # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
@@ -215,15 +259,19 @@ def test_sql_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -258,4 +306,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) From 472acae61546aa13f11bba5303daa1a54dac344e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 8 Jul 2024 15:05:59 -0400 Subject: [PATCH 202/412] fix(pageserver): write to both v1+v2 for aux tenant import (#8316) close https://github.com/neondatabase/neon/issues/8202 ref https://github.com/neondatabase/neon/pull/6560 For tenant imports, we now write the aux files into both v1+v2 storage, so that the test case can pick either one for testing. Given the API is only used for testing, this looks like a safe change. Signed-off-by: Alex Chi Z --- storage_controller/src/service.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3965d7453d..78f0848c24 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4062,7 +4062,14 @@ impl Service { placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking // There is no way to know what the tenant's config was: revert to defaults - config: TenantConfig::default(), + // + // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration + // + // we write to both v1+v2 storage, so that the test case can use either storage format for testing + config: TenantConfig { + switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), + ..TenantConfig::default() + }, }) .await?; From 7cfaecbeb63c130b8a91d96cfa9b6ff9396ca880 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 21:06:34 +0100 Subject: [PATCH 203/412] tests: stabilize test_timeline_size_quota_on_startup (#8255) ## Problem `test_timeline_size_quota_on_startup` assumed that writing data beyond the size limit would always be blocked. This is not so: the limit is only enforced if feedback makes it back from the pageserver to the safekeeper + compute. Closes: https://github.com/neondatabase/neon/issues/6562 ## Summary of changes - Modify the test to wait for the pageserver to catch up. The size limit was never actually being enforced robustly, the original version of this test was just writing much more than 30MB and about 98% of the time getting lucky such that the feedback happened to arrive before the tests for loop was done. - If the test fails, log the logical size as seen by the pageserver. 
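As a rough sketch, the flow the test follows after this change looks like the following (a simplified excerpt, not standalone code; the helper and fixture names are the ones used in the diff below):

```python
# Write past the 30MB limit, let the pageserver ingest it so feedback can make it
# back to the safekeeper + compute, then write again: one of the two write phases
# is expected to fail with psycopg2.errors.DiskFull.
write_rows(2500)  # exceeds neon.max_cluster_size
wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
write_rows(2500)  # by now the limit should be enforced
```
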
--- test_runner/regress/test_timeline_size.py | 46 +++++++++++++++++------ 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f47356839c..5e9a42f6b4 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -152,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) + size_limit_mb = 30 + endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", # Set small limit for the test - config_lines=["neon.max_cluster_size=30MB"], + config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"], ) endpoint_main.start() @@ -165,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): # Insert many rows. This query must fail because of space limit try: - for _i in range(5000): - cur.execute( - """ - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - """ - ) - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") + def write_rows(count): + for _i in range(count): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to + # the safekeeper, then try to write some more. We expect either the initial writes or the ones after + # the wait_for_last_flush_lsn to generate an exception. + # + # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562) + write_rows(2500) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + assert logical_size > size_limit_mb * 1024 * 1024 + write_rows(2500) + + # If we get here, the timeline size limit failed. Find out from the pageserver how large it + # thinks the timeline is. + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + log.error( + f"Query unexpectedly succeeded, pageserver logical size is {logical_size}" + ) raise AssertionError() except psycopg2.errors.DiskFull as err: From 0b6492e7d31db93b3effa7b70659270dfb7055b6 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 8 Jul 2024 16:50:13 -0400 Subject: [PATCH 204/412] tests: increase approx size equal threshold to avoid `test_lsn_lease_size` flakiness (#8282) ## Summary of changes Increase the `assert_size_approx_equal` threshold to avoid flakiness of `test_lsn_lease_size`. Still needs more investigation to fully resolve #8293. - Also set `autovacuum=off` for the endpoint we are running in the test. 
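For reference, the relaxed tolerance introduced in the diff below works out to the following (interpreting the constant the same way as the test's own comment about allowing "a few pages" of drift; the KiB figure is just arithmetic):

```python
# Assumed reading of the new test constant: 22 units of 8272 bytes each, i.e. the
# lease-based and branch-based size estimates may differ by roughly 178 KiB.
threshold = 22 * 8272  # = 181_984 bytes
```
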
Signed-off-by: Yuchen Liang --- test_runner/regress/test_tenant_size.py | 29 +++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 70e8fe67d5..b1ade77a14 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -720,9 +720,30 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, They should have the same effect. """ + def assert_size_approx_equal_for_lease_test(size_lease, size_branch): + """ + Tests that evaluate sizes are checking the pageserver space consumption + that sits many layers below the user input. The exact space needed + varies slightly depending on postgres behavior. + + Rather than expecting postgres to be determinstic and occasionally + failing the test, we permit sizes for the same data to vary by a few pages. + """ + + # FIXME(yuchen): The delta is too large, used as temp solution to pass the test reliably. + # Investigate and reduce the threshold. + threshold = 22 * 8272 + + log.info( + f"delta: size_branch({size_branch}) - size_lease({size_lease}) = {size_branch - size_lease}" + ) + + assert size_lease == pytest.approx(size_branch, abs=threshold) + conf = { "pitr_interval": "0s" if zero_gc else "3600s", "gc_period": "0s", + "compaction_period": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=conf) @@ -734,7 +755,7 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, tenant, timeline = env.neon_cli.create_tenant(conf=conf) lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") - assert_size_approx_equal(lease_res, ro_branch_res) + assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) def insert_with_action( @@ -754,7 +775,11 @@ def insert_with_action( """ client = env.pageserver.http_client() - with env.endpoints.create_start("main", tenant_id=tenant) as ep: + with env.endpoints.create_start( + "main", + tenant_id=tenant, + config_lines=["autovacuum=off"], + ) as ep: initial_size = client.tenant_size(tenant) log.info(f"initial size: {initial_size}") From 3b2fc27de462ee10ca8eacede7e23963c2f336cf Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Jul 2024 09:39:10 +0100 Subject: [PATCH 205/412] CI(promote-compatibility-data): take into account commit sha (#8283) ## Problem In https://github.com/neondatabase/neon/pull/8161, we changed the path to Neon artefacts by adding commit sha to it, but we missed adding these changes to `promote-compatibility-data` job that we use for backward/forward- compatibility testing. 
## Summary of changes - Add commit sha to `promote-compatibility-data` --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3246987e2..cb7655e039 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1336,6 +1336,7 @@ jobs: env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | # Update compatibility snapshot for the release for pg_version in v14 v15 v16; do @@ -1349,7 +1350,7 @@ jobs: # Update Neon artifact for the release (reuse already uploaded artifact) for build_type in debug release; do - OLD_PREFIX=artifacts/${GITHUB_RUN_ID} + OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) From 7b4a9c1d82bdd7d6ad35a469d4b3db81c5b12686 Mon Sep 17 00:00:00 2001 From: Luca BRUNO Date: Tue, 9 Jul 2024 10:43:42 +0200 Subject: [PATCH 206/412] proxy/http: avoid spurious vector reallocations This tweaks the rows-to-JSON rendering logic in order to avoid allocating 0-sized temporary vectors and later growing them to insert elements. As the exact size is known in advance, both vectors can be built with an exact capacity upfront. This will avoid further vector growing/reallocation in the rendering hotpath. Signed-off-by: Luca BRUNO --- proxy/src/serverless/sql_over_http.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 583ff75f7c..8118ae5ea8 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -838,8 +838,9 @@ async fn query_to_json( "finished reading rows" ); - let mut fields = vec![]; - let mut columns = vec![]; + let columns_len = row_stream.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut columns = Vec::with_capacity(columns_len); for c in row_stream.columns() { fields.push(json!({ From 44339f5b708331040862c9ae1aa1200e30498c72 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 9 Jul 2024 12:11:37 -0400 Subject: [PATCH 207/412] chore(storage-scrubber): allow disable file logging (#8297) part of https://github.com/neondatabase/cloud/issues/14024, k8s does not always have a volume available for logging, and I'm running into weird permission errors... While I could spend time figuring out how to create temp directories for logging, I think it would be better to just disable file logging as k8s containers are ephemeral and we cannot retrieve anything on the fs after the container gets removed. 
## Summary of changes `PAGESERVER_DISABLE_FILE_LOGGING=1` -> file logging disabled Signed-off-by: Alex Chi Z --- storage_scrubber/src/lib.rs | 40 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 6adaa5d38f..8f567b22e0 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -242,24 +242,36 @@ impl ConsoleConfig { } } -pub fn init_logging(file_name: &str) -> WorkerGuard { - let (file_writer, guard) = - tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); - - let file_logs = fmt::Layer::new() - .with_target(false) - .with_ansi(false) - .with_writer(file_writer); +pub fn init_logging(file_name: &str) -> Option { let stderr_logs = fmt::Layer::new() .with_target(false) .with_writer(std::io::stderr); - tracing_subscriber::registry() - .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) - .with(file_logs) - .with(stderr_logs) - .init(); - guard + let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") { + Ok(s) => s == "1" || s.to_lowercase() == "true", + Err(_) => false, + }; + + if disable_file_logging { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .init(); + None + } else { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .with(file_logs) + .init(); + Some(guard) + } } pub fn init_s3_client(bucket_region: Region) -> Client { From 1145700f87f2cbd72d8ddbe6d8b59ef3033298f8 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 9 Jul 2024 18:25:49 +0100 Subject: [PATCH 208/412] chore: fix nightly build (#8142) ## Problem `cargo +nightly check` fails ## Summary of changes Updates `measured`, `time`, and `crc32c`. * `measured`: updated to fix https://github.com/rust-lang/rust/issues/125763. 
* `time`: updated to fix https://github.com/rust-lang/rust/issues/125319 * `crc32c`: updated to remove some nightly feature detection with a removed nightly feature --- Cargo.lock | 65 ++++++++++++++++++++++++++++----------- Cargo.toml | 4 +-- libs/metrics/src/hll.rs | 14 ++++----- libs/metrics/src/lib.rs | 27 ++++------------ proxy/src/jemalloc.rs | 6 ++-- proxy/src/metrics.rs | 28 ++++++++++++++++- workspace_hack/Cargo.toml | 3 ++ 7 files changed, 94 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 716b6690d9..63628160d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1397,9 +1397,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ "rustc_version", ] @@ -1651,6 +1651,16 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "desim" version = "0.1.0" @@ -3008,9 +3018,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" +checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0" dependencies = [ "bytes", "crossbeam-utils", @@ -3026,9 +3036,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" +checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3038,9 +3048,9 @@ dependencies = [ [[package]] name = "measured-process" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", @@ -3275,6 +3285,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -4118,6 +4134,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5397,9 +5419,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] 
@@ -5416,9 +5438,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -6108,12 +6130,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.21" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ + "deranged", "itoa", "js-sys", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -6121,16 +6146,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ + "num-conv", "time-core", ] @@ -7428,6 +7454,7 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "deranged", "either", "fail", "futures-channel", @@ -7452,7 +7479,9 @@ dependencies = [ "num-traits", "once_cell", "parquet", + "proc-macro2", "prost", + "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", diff --git a/Cargo.toml b/Cargo.toml index 8fddaaef12..fc3dd51809 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,8 +111,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.21", features=["lasso"] } -measured-process = { version = "0.0.21" } +measured = { version = "0.0.22", features=["lasso"] } +measured-process = { version = "0.0.22" } memoffset = "0.8" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index f53511ab5c..723916a742 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -13,11 +13,7 @@ use std::{ use measured::{ label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{ - group::{Encoding, MetricValue}, - name::MetricNameEncoder, - Metric, MetricType, MetricVec, - }, + metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, text::TextEncoder, LabelGroup, }; @@ -144,6 +140,7 @@ impl HyperLogLogState { }) } } + impl measured::metric::MetricEncoding> for HyperLogLogState { @@ -182,12 +179,13 @@ impl measured::metric::MetricEncoding( labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Enc, -) -> Result<(), Enc::Err> { - enc.write_metric_value(name, labels, MetricValue::Int(x)) +) -> Result<(), Enc::Err> +where + GaugeState: MetricEncoding, +{ + GaugeState::new(x).collect_into(&(), labels, name, enc) } #[derive(Default)] @@ -544,15 +547,6 @@ impl Encoding for Inc { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, 
- value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } impl MetricEncoding> for MeasuredCounterPairState @@ -579,15 +573,6 @@ impl Encoding for Dec { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } /// Write the dec counter to the encoder diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index 3243e6a140..d307d80f4a 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -3,8 +3,8 @@ use std::marker::PhantomData; use measured::{ label::NoLabels, metric::{ - gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, - MetricEncoding, MetricFamilyEncoding, MetricType, + gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, + MetricFamilyEncoding, MetricType, }, text::TextEncoder, LabelGroup, MetricGroup, @@ -100,7 +100,7 @@ macro_rules! jemalloc_gauge { enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { if let Ok(v) = mib.read() { - enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; } Ok(()) } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e2a75a8720..db25ac0311 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, LabelGroup, MetricGroup, @@ -577,6 +577,32 @@ impl LabelGroup for ThreadPoolWorkerId { } } +impl LabelGroupSet for ThreadPoolWorkers { + type Group<'a> = ThreadPoolWorkerId; + + fn cardinality(&self) -> Option { + Some(self.0) + } + + fn encode_dense(&self, value: Self::Unique) -> Option { + Some(value) + } + + fn decode_dense(&self, value: usize) -> Self::Group<'_> { + ThreadPoolWorkerId(value) + } + + type Unique = usize; + + fn encode(&self, value: Self::Group<'_>) -> Option { + Some(value.0) + } + + fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { + ThreadPoolWorkerId(*value) + } +} + impl LabelSet for ThreadPoolWorkers { type Value<'a> = ThreadPoolWorkerId; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e1b1806bc8..7f57585994 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,6 +30,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } @@ -107,7 +108,9 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } 
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +proc-macro2 = { version = "1" } prost = { version = "0.11" } +quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } From f8c01c6341eb109e50aa338d45834a7728c319d3 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 9 Jul 2024 13:41:37 -0400 Subject: [PATCH 209/412] fix(storage-scrubber): use default AWS authentication (#8299) part of https://github.com/neondatabase/cloud/issues/14024 close https://github.com/neondatabase/neon/issues/7665 Things running in k8s container use this authentication: https://docs.aws.amazon.com/sdkref/latest/guide/feature-container-credentials.html while we did not configure the client to use it. This pull request simply uses the default s3 client credential chain for storage scrubber. It might break compatibility with minio. ## Summary of changes * Use default AWS credential provider chain. * Improvements for s3 errors, we now have detailed errors and correct backtrace on last trial of the operation. --------- Signed-off-by: Alex Chi Z Co-authored-by: Joonas Koivunen --- storage_scrubber/src/find_large_objects.rs | 2 +- storage_scrubber/src/garbage.rs | 4 +- storage_scrubber/src/lib.rs | 89 +++++-------------- storage_scrubber/src/main.rs | 2 +- .../src/pageserver_physical_gc.rs | 2 +- .../src/scan_pageserver_metadata.rs | 2 +- .../src/scan_safekeeper_metadata.rs | 2 +- storage_scrubber/src/tenant_snapshot.rs | 7 +- 8 files changed, 33 insertions(+), 77 deletions(-) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 1422545f2f..2ef802229d 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -47,7 +47,7 @@ pub async fn find_large_objects( ignore_deltas: bool, concurrency: usize, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); let objects_stream = tenants.map_ok(|tenant_shard_id| { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index ce0ff10ec6..0450851988 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -140,7 +140,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?; + let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -432,7 +432,7 @@ pub async fn purge_garbage( ); let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?; + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 8f567b22e0..9102ad9906 100644 --- a/storage_scrubber/src/lib.rs +++ 
b/storage_scrubber/src/lib.rs @@ -15,17 +15,10 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::retry::RetryConfig; -use aws_config::sso::SsoCredentialsProvider; -use aws_config::BehaviorVersion; -use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; -use aws_sdk_s3::{Client, Config}; -use aws_smithy_async::rt::sleep::TokioSleep; +use anyhow::{anyhow, Context}; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::error::DisplayErrorContext; +use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; @@ -274,65 +267,21 @@ pub fn init_logging(file_name: &str) -> Option { } } -pub fn init_s3_client(bucket_region: Region) -> Client { - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let chain = CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder().build(), - ); - - // Use SSO if we were given an account ID - match std::env::var("SSO_ACCOUNT_ID").ok() { - Some(sso_account) => chain.or_else( - "sso", - SsoCredentialsProvider::builder() - .account_id(sso_account) - .role_name("PowerUserAccess") - .start_url("https://neondb.awsapps.com/start") - .region(bucket_region.clone()) - .build(), - ), - None => chain, - } - .or_else( - // Finally try IMDS - "imds", - ImdsCredentialsProvider::builder().build(), - ) - }; - - let sleep_impl: Arc = Arc::new(TokioSleep::new()); - - let mut builder = Config::builder() - .behavior_version( - #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ - BehaviorVersion::v2023_11_09(), - ) +pub async fn init_s3_client(bucket_region: Region) -> Client { + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) - .retry_config(RetryConfig::adaptive().with_max_attempts(3)) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .credentials_provider(credentials_provider); - - if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { - builder = builder.endpoint_url(endpoint) - } - - Client::from_conf(builder.build()) + .load() + .await; + Client::new(&config) } -fn init_remote( +async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region).await); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { @@ -357,7 +306,7 @@ async fn list_objects_with_retries( s3_target: &S3Target, continuation_token: Option, ) -> anyhow::Result { - for _ in 0..MAX_RETRIES { + for trial in 0..MAX_RETRIES { match s3_client .list_objects_v2() .bucket(&s3_target.bucket_name) @@ -369,16 +318,22 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } error!( - "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", - 
s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), ); tokio::time::sleep(Duration::from_secs(1)).await; } } } - - anyhow::bail!("Failed to list objects {MAX_RETRIES} times") + Err(anyhow!("unreachable unless MAX_RETRIES==0")) } async fn download_object_with_retries( diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 16a26613d2..d816121192 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -196,7 +196,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let downloader = - SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?; downloader.download().await } Command::PageserverPhysicalGc { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 0146433128..fb8fbc1635 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -160,7 +160,7 @@ pub async fn pageserver_physical_gc( min_age: Duration, mode: GcMode, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index af74ffa4cd..df4f29acf7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -199,7 +199,7 @@ pub async fn scan_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 24051b03de..553adf8f46 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -106,7 +106,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 450b337235..5a75f8d40e 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -28,13 +28,13 @@ pub struct SnapshotDownloader { } impl SnapshotDownloader { - pub fn new( + pub async fn new( bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { - let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, s3_root) = 
init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, @@ -215,7 +215,8 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = + init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; From 547a431b0d183039620446e4aa5cb452102d65c9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 1 Jul 2024 13:45:42 -0500 Subject: [PATCH 210/412] Refactor how migrations are ran Just a small improvement I noticed while looking at fixing CVE-2024-4317 in Neon. --- compute_tools/src/lib.rs | 1 + compute_tools/src/migration.rs | 100 +++++++++++++++++++++++++++++++++ compute_tools/src/spec.rs | 65 +-------------------- 3 files changed, 103 insertions(+), 63 deletions(-) create mode 100644 compute_tools/src/migration.rs diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 18c228ba54..543d4462ed 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs new file mode 100644 index 0000000000..61dcf01c84 --- /dev/null +++ b/compute_tools/src/migration.rs @@ -0,0 +1,100 @@ +use anyhow::{Context, Result}; +use postgres::Client; +use tracing::info; + +pub(crate) struct MigrationRunner<'m> { + client: &'m mut Client, + migrations: &'m [&'m str], +} + +impl<'m> MigrationRunner<'m> { + pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + Self { client, migrations } + } + + fn get_migration_id(&mut self) -> Result { + let query = "SELECT id FROM neon_migration.migration_id"; + let row = self + .client + .query_one(query, &[]) + .context("run_migrations get migration_id")?; + + Ok(row.get::<&str, i64>("id")) + } + + fn update_migration_id(&mut self) -> Result<()> { + let setval = format!( + "UPDATE neon_migration.migration_id SET id={}", + self.migrations.len() + ); + + self.client + .simple_query(&setval) + .context("run_migrations update id")?; + + Ok(()) + } + + fn prepare_migrations(&mut self) -> Result<()> { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + self.client.simple_query(query)?; + + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + self.client.simple_query(query)?; + + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + self.client.simple_query(query)?; + + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + self.client.simple_query(query)?; + + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + self.client.simple_query(query)?; + + Ok(()) + } + + pub fn run_migrations(mut self) -> Result<()> { + self.prepare_migrations()?; + + let mut current_migration: usize = self.get_migration_id()? 
as usize; + let starting_migration_id = current_migration; + + let query = "BEGIN"; + self.client + .simple_query(query) + .context("run_migrations begin")?; + + while current_migration < self.migrations.len() { + let migration = self.migrations[current_migration]; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", current_migration); + } else { + info!( + "Running migration id={}:\n{}\n", + current_migration, migration + ); + self.client.simple_query(migration).with_context(|| { + format!("run_migration current_migration={}", current_migration) + })?; + } + + current_migration += 1; + } + + self.update_migration_id()?; + + let query = "COMMIT"; + self.client + .simple_query(query) + .context("run_migrations commit")?; + + info!( + "Ran {} migrations", + (self.migrations.len() - starting_migration_id) + ); + + Ok(()) + } +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 143f6c1e5f..37090b08fd 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::logger::inlinify; +use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -791,69 +792,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; - let mut func = || { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; - Ok::<_, anyhow::Error>(()) - }; - func().context("handle_migrations prepare")?; - - let query = "SELECT id FROM neon_migration.migration_id"; - let row = client - .query_one(query, &[]) - .context("handle_migrations get migration_id")?; - let mut current_migration: usize = row.get::<&str, i64>("id") as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - client - .simple_query(query) - .context("handle_migrations begin")?; - - while current_migration < migrations.len() { - let migration = &migrations[current_migration]; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); - } else { - info!( - "Running migration id={}:\n{}\n", - current_migration, migration - ); - client.simple_query(migration).with_context(|| { - format!("handle_migrations current_migration={}", current_migration) - })?; - } - current_migration += 1; - } - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - migrations.len() - ); - client - .simple_query(&setval) - .context("handle_migrations update id")?; - - let query = "COMMIT"; - client - .simple_query(query) - .context("handle_migrations commit")?; - - info!( - "Ran {} migrations", - (migrations.len() - starting_migration_id) - ); + MigrationRunner::new(client, &migrations).run_migrations()?; Ok(()) } From 6fd35bfe32067636cc8ad4e3a49766726c5462d4 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 9 Jul 2024 10:21:23 -0500 Subject: [PATCH 211/412] Add an 
application_name to more Neon connections Helps identify connections in the logs. --- compute_tools/src/compute.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1fa2b9f71d..eced6fc0b2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -798,7 +798,11 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let connstr = self.connstr.clone(); + let mut connstr = self.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "apply_config"); + let mut client = match Client::connect(connstr.as_str(), NoTls) { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) @@ -867,6 +871,11 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { + let mut connstr = connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "migrations"); + let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); From e619e8703edd4aedd1e27724cb53a97631385aaa Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 Jul 2024 20:11:11 +0200 Subject: [PATCH 212/412] refactor: postgres_backend: replace abstract shutdown_watcher with CancellationToken (#8295) Preliminary refactoring while working on https://github.com/neondatabase/neon/issues/7427 and specifically https://github.com/neondatabase/neon/pull/8286 --- Cargo.lock | 3 +- libs/postgres_backend/Cargo.toml | 3 +- libs/postgres_backend/src/lib.rs | 33 +++++++------------- libs/postgres_backend/tests/simple_select.rs | 7 +++-- pageserver/src/page_service.rs | 2 +- proxy/src/console/mgmt.rs | 7 +++-- safekeeper/src/wal_service.rs | 5 +-- workspace_hack/Cargo.toml | 2 -- 8 files changed, 28 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 63628160d1..776d95c3c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4094,6 +4094,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-util", "tracing", "workspace_hack", ] @@ -7458,10 +7459,8 @@ dependencies = [ "either", "fail", "futures-channel", - "futures-core", "futures-executor", "futures-io", - "futures-sink", "futures-util", "getrandom 0.2.11", "hashbrown 0.14.5", diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index 8e249c09f7..c7611b9f21 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -13,6 +13,7 @@ rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tokio-rustls.workspace = true tracing.workspace = true @@ -23,4 +24,4 @@ workspace_hack.workspace = true once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true -tokio-postgres-rustls.workspace = true \ No newline at end of file +tokio-postgres-rustls.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 6c41b7f347..c79ee4e053 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -16,6 +16,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, 
info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; @@ -400,21 +401,15 @@ impl PostgresBackend { } /// Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( + pub async fn run( mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S + Clone, - S: Future, - { - let ret = self - .run_message_loop(handler, shutdown_watcher.clone()) - .await; + cancel: &CancellationToken, + ) -> Result<(), QueryError> { + let ret = self.run_message_loop(handler, cancel).await; tokio::select! { - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next. } _ = self.framed.shutdown() => { @@ -444,21 +439,17 @@ impl PostgresBackend { } } - async fn run_message_loop( + async fn run_message_loop( &mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { + cancel: &CancellationToken, + ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) @@ -473,7 +464,7 @@ impl PostgresBackend { let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) @@ -485,7 +476,7 @@ impl PostgresBackend { let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. 
tracing::info!("shutdown request received during response flush"); diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 80df9db858..7ec85f0dbe 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -3,13 +3,14 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; -use std::{future, sync::Arc}; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { @@ -50,7 +51,7 @@ async fn simple_select() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); @@ -102,7 +103,7 @@ async fn simple_select_ssl() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder() diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 07365b5eb8..975c912970 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -267,7 +267,7 @@ async fn page_service_conn_main( let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend - .run(&mut conn_handler, task_mgr::shutdown_watcher) + .run(&mut conn_handler, &task_mgr::shutdown_token()) .await { Ok(()) => { diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index c7a2d467c0..befe7d7510 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -6,8 +6,9 @@ use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{convert::Infallible, future}; +use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -67,7 +68,9 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - pgbackend.run(&mut MgmtHandler, future::pending::<()>).await + pgbackend + .run(&mut MgmtHandler, &CancellationToken::new()) + .await } /// A message received by `mgmt` when a compute node is ready. diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 4a97eb3993..091571111e 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,9 +4,10 @@ //! 
use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, time::Duration}; +use std::time::Duration; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; @@ -100,7 +101,7 @@ async fn handle_socket( // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend - .run(&mut conn_handler, future::pending::<()>) + .run(&mut conn_handler, &CancellationToken::new()) .await } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7f57585994..832fe06bf6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -34,10 +34,8 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } From 35dac6e6c83a48cd4e8185122ee883160bdfdcb5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 Jul 2024 20:58:48 +0200 Subject: [PATCH 213/412] fix(l0_flush): drops permit before fsync, potential cause for OOMs (#8327) ## Problem Slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1720511577862519 We're seeing OOMs in staging on a pageserver that has l0_flush.mode=Direct enabled. There's a strong correlation between jumps in `maxrss_kb` and `pageserver_timeline_ephemeral_bytes`, so, it's quite likely that l0_flush.mode=Direct is the culprit. Notably, the expected max memory usage on that staging server by the l0_flush.mode=Direct is ~2GiB but we're seeing as much as 24GiB max RSS before the OOM kill. One hypothesis is that we're dropping the semaphore permit before all the dirtied pages have been flushed to disk. (The flushing to disk likely happens in the fsync inside the `.finish()` call, because we're using ext4 in data=ordered mode). ## Summary of changes Hold the permit until after we're done with `.finish()`. --- .../src/tenant/storage_layer/inmemory_layer.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e1eaea90af..5941a52e98 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -715,16 +715,22 @@ impl InMemoryLayer { res?; } } - - // Hold the permit until the IO is done; if we didn't, one could drop this future, - // thereby releasing the permit, but the Vec remains allocated until the IO completes. - // => we'd have more concurrenct Vec than allowed as per the semaphore. - drop(_concurrency_permit); } } // MAX is used here because we identify L0 layers by full key range let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; + + // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. 
+ // + // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of + // the `file_contents: Vec` until the IO is done, but not the permit's lifetime. + // Thus, we'd have more concurrenct `Vec` in existence than the semaphore allows. + // + // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages + // we dirtied when writing to the filesystem have been flushed and marked !dirty. + drop(_concurrency_permit); + Ok(Some(delta_layer)) } } From bfc733824693d780ce16bd610e6de470728aa21b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 Jul 2024 23:17:42 +0200 Subject: [PATCH 214/412] pageserver: move `page_service`'s `import basebackup` / `import wal` to mgmt API (#8292) I want to fix bugs in `page_service` ([issue](https://github.com/neondatabase/neon/issues/7427)) and the `import basebackup` / `import wal` stand in the way / make the refactoring more complicated. We don't use these methods anyway in practice, but, there have been some objections to removing the functionality completely. So, this PR preserves the existing functionality but moves it into the HTTP management API. Note that I don't try to fix existing bugs in the code, specifically not fixing * it only ever worked correctly for unsharded tenants * it doesn't clean up on error All errors are mapped to `ApiError::InternalServerError`. --- control_plane/src/pageserver.rs | 58 ++--- libs/utils/src/http/request.rs | 9 + pageserver/client/Cargo.toml | 2 +- pageserver/client/src/mgmt_api.rs | 79 ++++++- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/http/routes.rs | 194 ++++++++++++++++ pageserver/src/metrics.rs | 2 - pageserver/src/page_service.rs | 357 +---------------------------- storage_controller/src/node.rs | 2 +- storage_controller/src/service.rs | 4 + test_runner/regress/test_import.py | 3 +- 11 files changed, 302 insertions(+), 409 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 983f78577c..f0403b1796 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -15,7 +15,6 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use futures::SinkExt; use pageserver_api::models::{ self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, }; @@ -566,60 +565,39 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let (client, conn) = self.page_server_psql_client().await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. 
- tokio::spawn(async move { - if let Err(e) = conn.await { - eprintln!("connection error: {}", e); - } - }); - let client = std::pin::pin!(client); - // Init base reader let (start_lsn, base_tarfile_path) = base; let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; - let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); + let base_tarfile = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile)); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; - let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); + let wal_reader = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile)); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - let copy_in = |reader, cmd| { - let client = &client; - async move { - let writer = client.copy_in(&cmd).await?; - let writer = std::pin::pin!(writer); - let mut writer = writer.sink_map_err(|e| { - std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) - }); - let mut reader = std::pin::pin!(reader); - writer.send_all(&mut reader).await?; - writer.into_inner().finish().await?; - anyhow::Ok(()) - } - }; - // Import base - copy_in( - base_tarfile, - format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ), - ) - .await?; - // Import wal if necessary - if let Some(wal_reader) = wal_reader { - copy_in( - wal_reader, - format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + self.http_client + .import_basebackup( + tenant_id, + timeline_id, + start_lsn, + end_lsn, + pg_version, + base_tarfile, ) .await?; + + // Import wal if necessary + if let Some(wal_reader) = wal_reader { + self.http_client + .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader) + .await?; } Ok(()) diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 766bbfc9df..8b8ed5a67f 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -74,6 +74,15 @@ pub fn parse_query_param>( .transpose() } +pub fn must_parse_query_param>( + request: &Request, + param_name: &str, +) -> Result { + parse_query_param(request, param_name)?.ok_or_else(|| { + ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) + }) +} + pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 0ed27602cd..a938367334 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true pageserver_api.workspace = true thiserror.workspace = true async-trait.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 48b27775cb..e3ddb446fa 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -9,6 +9,8 @@ use utils::{ lsn::Lsn, }; +pub use reqwest::Body as ReqwestBody; + pub mod util; #[derive(Debug, Clone)] @@ -20,6 +22,9 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("send request: {0}")] + 
SendRequest(reqwest::Error), + #[error("receive body: {0}")] ReceiveBody(reqwest::Error), @@ -173,19 +178,30 @@ impl Client { self.request(Method::GET, uri, ()).await } + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + } + } + async fn request_noerror( &self, method: Method, uri: U, body: B, ) -> Result { - let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; - req.json(&body).send().await.map_err(Error::ReceiveBody) + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) } async fn request( @@ -609,4 +625,53 @@ impl Client { }), } } + + pub async fn import_basebackup( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + end_lsn: Lsn, + pg_version: u32, + basebackup_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(basebackup_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn import_wal( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + wal_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(wal_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2763352a21..9f705f0bc9 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -660,7 +660,6 @@ fn start_pageserver( async move { page_service::libpq_listener_main( tenant_manager, - broker_client, pg_auth, pageserver_listener, conf.pg_auth_type, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 893302b7d6..6f8f3e6389 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; use hyper::header; @@ -44,12 +45,14 @@ use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; +use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; @@ -2404,6 +2407,189 @@ async fn post_top_tenants( ) } +async fn put_tenant_timeline_import_basebackup( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + + let broker_client = state.broker_client.clone(); + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .map_err(ApiError::InternalServerError) + .await?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. 
+ + // Import basebackup provided via CopyData + info!("importing basebackup"); + + timeline + .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + // Read the end of the tar archive. + read_tar_eof(body) + .await + .map_err(ApiError::InternalServerError)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. + + info!("done"); + json_response(StatusCode::OK, ()) + } + .instrument(span) + .await +} + +async fn put_tenant_timeline_import_wal( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); + async move { + let state = get_state(&request); + + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?; + info!("wal import complete"); + + // Read the end of the tar archive. + read_tar_eof(body).await.map_err(ApiError::InternalServerError)?; + + // TODO Does it make sense to overshoot? + if timeline.get_last_record_lsn() < end_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + timeline.freeze_and_flush().await.map_err(|e| match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; + + info!("done"); + + json_response(StatusCode::OK, ()) + }.instrument(span).await +} + +/// Read the end of a tar archive. +/// +/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. +/// `tokio_tar` already read the first such block. Read the second all-zeros block, +/// and check that there is no more data after the EOF marker. 
+/// +/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. +async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> { + use tokio::io::AsyncReadExt; + let mut buf = [0u8; 512]; + + // Read the all-zeros block, and verify it + let mut total_bytes = 0; + while total_bytes < 512 { + let nbytes = reader.read(&mut buf[total_bytes..]).await?; + total_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if total_bytes < 512 { + anyhow::bail!("incomplete or invalid tar EOF marker"); + } + if !buf.iter().all(|&x| x == 0) { + anyhow::bail!("invalid tar EOF marker"); + } + + // Drain any extra zero-blocks after the EOF marker + let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; + loop { + let nbytes = reader.read(&mut buf).await?; + trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } + if nbytes == 0 { + break; + } + } + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + } + Ok(()) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2698,5 +2884,13 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", |r| testing_api_handler("perf_info", r, perf_info), ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", + |r| api_handler(r, put_tenant_timeline_import_basebackup), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", + |r| api_handler(r, put_tenant_timeline_import_wal), + ) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 59b7293631..e67fa656d0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1473,8 +1473,6 @@ pub(crate) enum ComputeCommandKind { PageStream, Basebackup, Fullbackup, - ImportBasebackup, - ImportWal, LeaseLsn, Show, } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 975c912970..c10c2f2a0f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,7 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use bytes::Bytes; use futures::stream::FuturesUnordered; -use futures::Stream; use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::models::TenantState; @@ -28,7 +26,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::net::TcpListener; -use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -37,7 +34,6 @@ use std::time::Instant; use std::time::SystemTime; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::ConnectionId; @@ -53,7 +49,6 @@ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; -use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; @@ -66,7 +61,6 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use 
crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -82,56 +76,6 @@ use postgres_ffi::BLCKSZ; // is not yet in state [`TenantState::Active`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Read the end of a tar archive. -/// -/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. -/// `tokio_tar` already read the first such block. Read the second all-zeros block, -/// and check that there is no more data after the EOF marker. -/// -/// 'tar' command can also write extra blocks of zeros, up to a record -/// size, controlled by the --record-size argument. Ignore them too. -async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { - use tokio::io::AsyncReadExt; - let mut buf = [0u8; 512]; - - // Read the all-zeros block, and verify it - let mut total_bytes = 0; - while total_bytes < 512 { - let nbytes = reader.read(&mut buf[total_bytes..]).await?; - total_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if total_bytes < 512 { - anyhow::bail!("incomplete or invalid tar EOF marker"); - } - if !buf.iter().all(|&x| x == 0) { - anyhow::bail!("invalid tar EOF marker"); - } - - // Drain any extra zero-blocks after the EOF marker - let mut trailing_bytes = 0; - let mut seen_nonzero_bytes = false; - loop { - let nbytes = reader.read(&mut buf).await?; - trailing_bytes += nbytes; - if !buf.iter().all(|&x| x == 0) { - seen_nonzero_bytes = true; - } - if nbytes == 0 { - break; - } - } - if seen_nonzero_bytes { - anyhow::bail!("unexpected non-zero bytes after the tar archive"); - } - if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); - } - Ok(()) -} - /////////////////////////////////////////////////////////////////////////////// /// @@ -141,7 +85,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// pub async fn libpq_listener_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, auth_type: AuthType, @@ -186,7 +129,6 @@ pub async fn libpq_listener_main( false, page_service_conn_main( tenant_manager.clone(), - broker_client.clone(), local_auth, socket, auth_type, @@ -209,7 +151,6 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, @@ -262,8 +203,7 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
- let mut conn_handler = - PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); + let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -294,7 +234,6 @@ struct HandlerTimeline { } struct PageServerHandler { - broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -386,13 +325,11 @@ impl From for QueryError { impl PageServerHandler { pub fn new( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { tenant_manager, - broker_client, auth, claims: None, connection_ctx, @@ -475,73 +412,6 @@ impl PageServerHandler { ) } - fn copyin_stream<'a, IO>( - &'a self, - pgb: &'a mut PostgresBackend, - cancel: &'a CancellationToken, - ) -> impl Stream> + 'a - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - async_stream::try_stream! { - loop { - let msg = tokio::select! { - biased; - - _ = cancel.cancelled() => { - // We were requested to shut down. - let msg = "pageserver is shutting down"; - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); - Err(QueryError::Shutdown) - } - - msg = pgb.read_message() => { msg.map_err(QueryError::from)} - }; - - match msg { - Ok(Some(message)) => { - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => { break }, - FeMessage::Sync => continue, - FeMessage::Terminate => { - let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - break; - } - m => { - let msg = format!("unexpected message {m:?}"); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::Other, msg))?; - break; - } - }; - - yield copy_data_bytes; - } - Ok(None) => { - let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - } - Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { - Err(io_error)?; - } - Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; - } - }; - } - } - } - #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -713,128 +583,6 @@ impl PageServerHandler { Ok(()) } - #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] - async fn handle_import_basebackup( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - base_lsn: Lsn, - 
_end_lsn: Lsn, - pg_version: u32, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - - // Create empty timeline - info!("creating new timeline"); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) - .await?; - let timeline = tenant - .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) - .await?; - - // TODO mark timeline as not ready until it reaches end_lsn. - // We might have some wal to import as well, and we should prevent compute - // from connecting before that and writing conflicting wal. - // - // This is not relevant for pageserver->pageserver migrations, since there's - // no wal to import. But should be fixed if we want to import from postgres. - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import basebackup provided via CopyData - info!("importing basebackup"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; - - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); - timeline - .import_basebackup_from_tar( - tenant.clone(), - &mut copyin_reader, - base_lsn, - self.broker_client.clone(), - &ctx, - ) - .await?; - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO check checksum - // Meanwhile you can verify client-side by taking fullbackup - // and checking that it matches in size with what was imported. - // It wouldn't work if base came from vanilla postgres though, - // since we discard some log files. - - info!("done"); - Ok(()) - } - - #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] - async fn handle_import_wal( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - start_lsn: Lsn, - end_lsn: Lsn, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let last_record_lsn = timeline.get_last_record_lsn(); - if last_record_lsn != start_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import wal provided via CopyData - info!("importing wal"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); - import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; - info!("wal import complete"); - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO Does it make sense to overshoot? - if timeline.get_last_record_lsn() < end_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // Flush data to disk, then upload to s3. No need for a forced checkpoint. - // We only want to persist the data, and it doesn't matter if it's in the - // shape of deltas or images. 
- info!("flushing layers"); - timeline.freeze_and_flush().await.map_err(|e| match e { - FlushLayerError::Cancelled => QueryError::Shutdown, - other => QueryError::Other(other.into()), - })?; - - info!("done"); - Ok(()) - } - /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -1705,109 +1453,6 @@ where ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("import basebackup ") { - // Import the `base` section (everything but the wal) of a basebackup. - // Assumes the tenant already exists on this pageserver. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - // - // Example import command: - // 1. Get start/end LSN from backup_manifest file - // 2. Run: - // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let params = &parts[2..]; - if params.len() != 5 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import basebackup command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let base_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - let pg_version = u32::from_str(params[4]) - .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportBasebackup) - .inc(); - - match self - .handle_import_basebackup( - pgb, - tenant_id, - timeline_id, - base_lsn, - end_lsn, - pg_version, - ctx, - ) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; - } else if query_string.starts_with("import wal ") { - // Import the `pg_wal` section of a basebackup. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. 
- let params = &parts[2..]; - if params.len() != 4 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import wal command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let start_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportWal) - .inc(); - - match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4d17dff9fe..fff44aaf26 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -226,7 +226,7 @@ impl Node { fn is_fatal(e: &mgmt_api::Error) -> bool { use mgmt_api::Error::*; match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, + SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false, ApiError(StatusCode::SERVICE_UNAVAILABLE, _) | ApiError(StatusCode::GATEWAY_TIMEOUT, _) | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 78f0848c24..aada1939ee 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -151,6 +151,10 @@ struct ServiceState { /// controller API. 
fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { match e { + mgmt_api::Error::SendRequest(e) => { + // Presume errors sending requests are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into()) + } mgmt_api::Error::ReceiveErrorBody(str) => { // Presume errors receiving body are connectivity/availability issues ApiError::ResourceUnavailable( diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index d97e882a70..4dae9176b8 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -88,7 +88,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.allowed_errors.extend( [ - ".*error importing base backup .*", + ".*Failed to import basebackup.*", + ".*unexpected non-zero bytes after the tar archive.*", ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", From 4df39d730444ba107ce1b3e4138c7fa43923fdc3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Jul 2024 09:10:29 +0100 Subject: [PATCH 215/412] proxy: pg17 fixes (#8321) ## Problem #7809 - we do not support sslnegotiation=direct #7810 - we do not support negotiating down the protocol extensions. ## Summary of changes 1. Same as postgres, check the first startup packet byte for tls header `0x16`, and check the ALPN. 2. Tell clients using protocol >3.0 to downgrade --- libs/postgres_backend/src/lib.rs | 12 ++- libs/pq_proto/src/framed.rs | 6 +- libs/pq_proto/src/lib.rs | 91 ++++++++++++++---- proxy/src/bin/pg_sni_router.rs | 3 +- proxy/src/config.rs | 12 ++- proxy/src/proxy/handshake.rs | 152 ++++++++++++++++++++++++++----- 6 files changed, 222 insertions(+), 54 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index c79ee4e053..7c7c6535b3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -663,11 +663,17 @@ impl PostgresBackend { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { - FeStartupPacket::SslRequest => { + FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls)) - .await?; + if !direct { + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(QueryError::Other(anyhow::anyhow!( + "direct SSL negotiation but no TLS support" + ))); + } if have_tls { self.start_tls().await?; diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 6e97b8c2a0..ccbb90e384 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -44,9 +44,9 @@ impl ConnectionError { /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. 
pub struct Framed { - stream: S, - read_buf: BytesMut, - write_buf: BytesMut, + pub stream: S, + pub read_buf: BytesMut, + pub write_buf: BytesMut, } impl Framed { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index cee3742017..a01191bd5d 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -39,14 +39,39 @@ pub enum FeMessage { PasswordMessage(Bytes), } +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct ProtocolVersion(u32); + +impl ProtocolVersion { + pub const fn new(major: u16, minor: u16) -> Self { + Self((major as u32) << 16 | minor as u32) + } + pub const fn minor(self) -> u16 { + self.0 as u16 + } + pub const fn major(self) -> u16 { + (self.0 >> 16) as u16 + } +} + +impl fmt::Debug for ProtocolVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list() + .entry(&self.major()) + .entry(&self.minor()) + .finish() + } +} + #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), - SslRequest, + SslRequest { + direct: bool, + }, GssEncRequest, StartupMessage { - major_version: u32, - minor_version: u32, + version: ProtocolVersion, params: StartupMessageParams, }, } @@ -301,11 +326,23 @@ impl FeStartupPacket { /// different from [`FeMessage::parse`] because startup messages don't have /// message type byte; otherwise, its comments apply. pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; - const CANCEL_REQUEST_CODE: u32 = 5678; - const NEGOTIATE_SSL_CODE: u32 = 5679; - const NEGOTIATE_GSS_CODE: u32 = 5680; + const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; + /// + const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); + /// + const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); + /// + const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + + // + // First byte indicates standard SSL handshake message + // (It can't be a Postgres startup length because in network byte order + // that would be a startup packet hundreds of megabytes long) + if buf.first() == Some(&0x16) { + return Ok(Some(FeStartupPacket::SslRequest { direct: true })); + } // need at least 4 bytes with packet len if buf.len() < 4 { @@ -338,12 +375,10 @@ impl FeStartupPacket { let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len - let request_code = msg.get_u32(); - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); + let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
- let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + let message = match request_code { + CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" @@ -355,21 +390,22 @@ impl FeStartupPacket { cancel_key: msg.get_i32(), }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest + FeStartupPacket::SslRequest { direct: false } } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" + "Unrecognized request code {}", + version.minor() ))); } // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { + version => { // StartupMessage let s = str::from_utf8(&msg).map_err(|_e| { @@ -382,8 +418,7 @@ impl FeStartupPacket { })?; FeStartupPacket::StartupMessage { - major_version, - minor_version, + version, params: StartupMessageParams { params: msg.slice_ref(s.as_bytes()), }, @@ -522,6 +557,10 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), NoticeResponse(&'a str), + NegotiateProtocolVersion { + version: ProtocolVersion, + options: &'a [&'a str], + }, KeepAlive(WalSndKeepAlive), } @@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(u8::from(req.request_reply)); }); } + + BeMessage::NegotiateProtocolVersion { version, options } => { + buf.put_u8(b'v'); + write_body(buf, |buf| { + buf.put_u32(version.0); + buf.put_u32(options.len() as u32); + for option in options.iter() { + write_cstr(option, buf)?; + } + Ok(()) + })? + } } Ok(()) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 44e880838e..d7a3eb9a4d 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -216,10 +216,11 @@ async fn ssl_handshake( use pq_proto::FeStartupPacket::*; match msg { - SslRequest => { + SslRequest { direct: false } => { stream .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) .await?; + // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. diff --git a/proxy/src/config.rs b/proxy/src/config.rs index af5511d7ec..6504919760 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -75,6 +75,9 @@ impl TlsConfig { } } +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + /// Configure TLS for the main endpoint. 
pub fn configure_tls( key_path: &str, @@ -111,16 +114,17 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ &rustls::version::TLS13, &rustls::version::TLS12, ]) .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; Ok(TlsConfig { - config, + config: Arc::new(config), common_names, cert_resolver, }) diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index dd935cc245..d488aea927 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,11 +1,17 @@ -use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use bytes::Buf; +use pq_proto::{ + framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, + StartupMessageParams, +}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, warn}; use crate::{ - config::TlsConfig, + auth::endpoint_sni, + config::{TlsConfig, PG_ALPN_PROTOCOL}, error::ReportableError, + metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, stream::{PqStream, Stream, StreamUpgradeError}, }; @@ -68,6 +74,9 @@ pub async fn handshake( // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0); + const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -75,40 +84,96 @@ pub async fn handshake( use FeStartupPacket::*; match msg { - SslRequest => match stream.get_ref() { + SslRequest { direct } => match stream.get_ref() { Stream::Raw { .. } if !tried_ssl => { tried_ssl = true; // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; + let have_tls = tls.is_some(); + if !direct { + stream + .write_message(&Be::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(HandshakeError::ProtocolViolation); + } + if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. 
+ let Framed { + stream: raw, + read_buf, + write_buf, + } = stream.framed; + + let Stream::Raw { raw } = raw else { + return Err(HandshakeError::StreamUpgradeError( + StreamUpgradeError::AlreadyTls, + )); + }; + + let mut read_buf = read_buf.reader(); + let mut res = Ok(()); + let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + .accept_with(raw, |session| { + // push the early data to the tls session + while !read_buf.get_ref().is_empty() { + match session.read_tls(&mut read_buf) { + Ok(_) => {} + Err(e) => { + res = Err(e); + break; + } + } + } + }); + + res?; + + let read_buf = read_buf.into_inner(); if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw - .upgrade(tls.to_server_config(), record_handshake_error) - .await?; + + let tls_stream = accept.await.inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?; + + let conn_info = tls_stream.get_ref().1; + + // check the ALPN, if exists, as required. + match conn_info.alpn_protocol() { + None | Some(PG_ALPN_PROTOCOL) => {} + Some(other) => { + // try parse ep for better error + let ep = conn_info.server_name().and_then(|sni| { + endpoint_sni(sni, &tls.common_names).ok().flatten() + }); + let alpn = String::from_utf8_lossy(other); + warn!(?ep, %alpn, "unexpected ALPN"); + return Err(HandshakeError::ProtocolViolation); + } + } let (_, tls_server_end_point) = tls .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) + .resolve(conn_info.server_name()) .ok_or(HandshakeError::MissingCertificate)?; - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); + stream = PqStream { + framed: Framed { + stream: Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }, + read_buf, + write_buf, + }, + }; } } _ => return Err(HandshakeError::ProtocolViolation), @@ -122,7 +187,9 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - StartupMessage { params, .. } => { + StartupMessage { params, version } + if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => + { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). if tls.is_some() { @@ -131,9 +198,48 @@ pub async fn handshake( .await?; } - info!(session_type = "normal", "successful handshake"); + info!(?version, session_type = "normal", "successful handshake"); break Ok(HandshakeData::Startup(stream, params)); } + // downgrade protocol version + StartupMessage { params, version } + if version.major() == 3 && version > PG_PROTOCOL_LATEST => + { + warn!(?version, "unsupported minor version"); + + // no protocol extensions are supported. + // + let mut unsupported = vec![]; + for (k, _) in params.iter() { + if k.starts_with("_pq_.") { + unsupported.push(k); + } + } + + // TODO: remove unsupported options so we don't send them to compute. + + stream + .write_message(&Be::NegotiateProtocolVersion { + version: PG_PROTOCOL_LATEST, + options: &unsupported, + }) + .await?; + + info!( + ?version, + session_type = "normal", + "successful handshake; unsupported minor version requested" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + StartupMessage { version, .. 
} => { + warn!( + ?version, + session_type = "normal", + "unsuccessful handshake; unsupported version" + ); + return Err(HandshakeError::ProtocolViolation); + } CancelRequest(cancel_key_data) => { info!(session_type = "cancellation", "successful handshake"); break Ok(HandshakeData::Cancel(cancel_key_data)); From f28abb953d5a37ceaf689b0ed9c2a59e944ffb32 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 10 Jul 2024 14:14:10 +0100 Subject: [PATCH 216/412] tests: stabilize test_sharding_split_compaction (#8318) ## Problem This test incorrectly assumed that a post-split compaction would only drop content. This was easily destabilized by any changes to image generation rules. ## Summary of changes - Before split, do a full image layer generation pass, to guarantee that post-split compaction should only drop data, never create it. - Fix the force_image_layer_creation mode of compaction that we use from tests like this: previously it would try and generate image layers even if one already existed with the same layer key, which caused compaction to fail. --- pageserver/src/tenant/timeline.rs | 19 ++++++++++++++++++- .../src/tenant/timeline/layer_manager.rs | 8 ++++++++ test_runner/regress/test_sharding.py | 6 ++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 541704e8d6..762e903bf8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,12 +66,12 @@ use std::{ ops::{Deref, Range}, }; -use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, + storage_layer::PersistentLayerDesc, }, }; use crate::{ @@ -98,6 +98,7 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -4572,6 +4573,22 @@ impl Timeline { start = img_range.end; continue; } + } else if let ImageLayerCreationMode::Force = mode { + // When forced to create image layers, we might try and create them where they already + // exist. This mode is only used in tests/debug. 
+ let layers = self.layers.read().await; + if layers.contains_key(&PersistentLayerKey { + key_range: img_range.clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }) { + tracing::info!( + "Skipping image layer at {lsn} {}..{}, already exists", + img_range.start, + img_range.end + ); + continue; + } } let image_layer_writer = ImageLayerWriter::new( diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 948237e06a..a43ff873ac 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -339,6 +339,10 @@ impl LayerManager { self.layer_fmgr.contains(layer) } + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layer_fmgr.contains_key(key) + } + pub(crate) fn all_persistent_layers(&self) -> Vec { self.layer_fmgr.0.keys().cloned().collect_vec() } @@ -363,6 +367,10 @@ impl LayerFileManager { .clone() } + fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.0.contains_key(key) + } + pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index d414f986e6..4471237900 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -225,6 +225,12 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: workload.validate() workload.stop() + # Do a full image layer generation before splitting, so that when we compact after splitting + # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) + env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) From 4c7c00268caa749dac7db1ab3aa2856aefd8bf4e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Jul 2024 15:05:25 +0100 Subject: [PATCH 217/412] proxy: remove some trace logs (#8334) --- proxy/src/http.rs | 41 +---------------------------------------- proxy/src/logging.rs | 3 ++- 2 files changed, 3 insertions(+), 41 deletions(-) diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fc7400869f..dd7164181d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,14 +4,11 @@ pub mod health_server; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::time::Duration; -use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; -use tokio::time::Instant; -use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -24,8 +21,6 @@ use reqwest_middleware::RequestBuilder; /// We deliberately don't want to replace this with a public static. 
pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .build() .expect("Failed to create http client"); @@ -36,8 +31,6 @@ pub fn new_client() -> ClientWithMiddleware { pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -103,38 +96,6 @@ impl Endpoint { } } -use hyper_util::client::legacy::connect::dns::{ - GaiResolver as HyperGaiResolver, Name as HyperName, -}; -use reqwest::dns::{Addrs, Name, Resolve, Resolving}; -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use tower_service::Service; -#[derive(Debug)] -pub struct GaiResolver(HyperGaiResolver); - -impl Default for GaiResolver { - fn default() -> Self { - Self(HyperGaiResolver::new()) - } -} - -impl Resolve for GaiResolver { - fn resolve(&self, name: Name) -> Resolving { - let this = &mut self.0.clone(); - let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); - let start = Instant::now(); - Box::pin( - Service::::call(this, hyper_name).map(move |result| { - let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); - result - .map(|addrs| -> Addrs { Box::new(addrs) }) - .map_err(|err| -> Box { Box::new(err) }) - }), - ) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3405b8cbc6..3b30ad8b46 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -15,7 +15,8 @@ use tracing_subscriber::{ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + .from_env_lossy() + .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) From 4d75e1ef81d2180211ee84501ccacd9b3effcc11 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:12:57 +0000 Subject: [PATCH 218/412] build(deps-dev): bump zipp from 3.8.1 to 3.19.1 Bumps [zipp](https://github.com/jaraco/zipp) from 3.8.1 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.8.1...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index bf16aaf55d..8091141411 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3133,18 +3133,18 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.8.1" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" From 8e2fe6b22efdc14302c405e2e9cacaab271d3887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 10 Jul 2024 18:09:19 +0200 Subject: [PATCH 219/412] Remove ImageCompressionAlgorithm::DisabledNoDecompress (#8300) Removes the `ImageCompressionAlgorithm::DisabledNoDecompress` variant. We now assume any blob with the specific bits set is actually a compressed blob. The `ImageCompressionAlgorithm::Disabled` variant still remains and is the new default. Reverts large parts of #8238 , as originally intended in that PR. Part of #5431 --- libs/pageserver_api/src/models.rs | 14 ---------- pageserver/src/config.rs | 2 +- pageserver/src/tenant/blob_io.rs | 11 ++------ pageserver/src/tenant/block_io.rs | 10 +------ .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 28 ++++++------------- pageserver/src/tenant/storage_layer/layer.rs | 1 - 7 files changed, 15 insertions(+), 53 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 49c942938d..d360cc6e87 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -440,9 +440,6 @@ pub enum CompactionAlgorithm { #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum ImageCompressionAlgorithm { - /// Disabled for writes, and never decompress during reading. - /// Never set this after you've enabled compression once! - DisabledNoDecompress, // Disabled for writes, support decompressing during read path Disabled, /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. 
@@ -452,12 +449,6 @@ pub enum ImageCompressionAlgorithm { }, } -impl ImageCompressionAlgorithm { - pub fn allow_decompression(&self) -> bool { - !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress) - } -} - impl FromStr for ImageCompressionAlgorithm { type Err = anyhow::Error; fn from_str(s: &str) -> Result { @@ -466,7 +457,6 @@ impl FromStr for ImageCompressionAlgorithm { .next() .ok_or_else(|| anyhow::anyhow!("empty string"))?; match first { - "disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress), "disabled" => Ok(ImageCompressionAlgorithm::Disabled), "zstd" => { let level = if let Some(v) = components.next() { @@ -1683,10 +1673,6 @@ mod tests { ImageCompressionAlgorithm::from_str("disabled").unwrap(), Disabled ); - assert_eq!( - ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(), - DisabledNoDecompress - ); assert_eq!( ImageCompressionAlgorithm::from_str("zstd").unwrap(), Zstd { level: None } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b7c9af2244..17bc427b2c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -92,7 +92,7 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = - ImageCompressionAlgorithm::DisabledNoDecompress; + ImageCompressionAlgorithm::Disabled; pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0705182d5d..e98ed66ef9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -273,12 +273,8 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed( - srcbuf, - ctx, - ImageCompressionAlgorithm::DisabledNoDecompress, - ) - .await + self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await } /// Write a blob of data. 
Returns the offset that it was written to, @@ -340,8 +336,7 @@ impl BlobWriter { (BYTE_UNCOMPRESSED, len, slice.into_inner()) } } - ImageCompressionAlgorithm::Disabled - | ImageCompressionAlgorithm::DisabledNoDecompress => { + ImageCompressionAlgorithm::Disabled => { (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) } }; diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 3324e840ec..601b095155 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -202,18 +202,10 @@ pub struct FileBlockReader<'a> { impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - Self::new_with_compression(file, file_id, false) - } - - pub fn new_with_compression( - file: &'a VirtualFile, - file_id: FileId, - compressed_reads: bool, - ) -> Self { FileBlockReader { file_id, file, - compressed_reads, + compressed_reads: true, } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 685f6dce60..000076d7c0 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -453,7 +453,7 @@ impl DeltaLayerWriterInner { ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); // We don't want to use compression in delta layer creation - let compression = ImageCompressionAlgorithm::DisabledNoDecompress; + let compression = ImageCompressionAlgorithm::Disabled; let (val, res) = self .blob_writer .write_blob_maybe_compressed(val, ctx, compression) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 4a1b3a0237..50aacbd9ad 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -165,7 +165,6 @@ pub struct ImageLayerInner { file_id: FileId, max_vectored_read_bytes: Option, - compressed_reads: bool, } impl std::fmt::Debug for ImageLayerInner { @@ -179,8 +178,7 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, @@ -268,10 +266,9 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = - ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx) - .await - .and_then(|res| res)?; + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) + .await + .and_then(|res| res)?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -380,7 +377,6 @@ impl ImageLayerInner { lsn: Lsn, summary: Option, max_vectored_read_bytes: Option, - support_compressed_reads: bool, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path, ctx).await { @@ -424,7 +420,6 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, - compressed_reads: support_compressed_reads, key_range: actual_summary.key_range, })) } @@ -435,8 +430,7 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let block_reader = - 
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); @@ -496,14 +490,12 @@ impl ImageLayerInner { &self, ctx: &RequestContext, ) -> anyhow::Result> { - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { // TODO: dedup code with get_reconstruct_value @@ -538,8 +530,7 @@ impl ImageLayerInner { .into(), ); - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); @@ -700,8 +691,7 @@ impl ImageLayerInner { #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); ImageLayerIterator { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index afd11780e7..02069c29d2 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1685,7 +1685,6 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - owner.conf.image_compression.allow_decompression(), ctx, ) .await From bea6532881b830e10b8844f4ffb0f98656aa444e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 10 Jul 2024 14:11:27 -0400 Subject: [PATCH 220/412] feat(pageserver): add k-merge layer iterator with lazy loading (#8053) Part of https://github.com/neondatabase/neon/issues/8002. This pull request adds a k-merge iterator for bottom-most compaction. ## Summary of changes * Added back lsn_range / key_range in delta layer inner. This was removed due to https://github.com/neondatabase/neon/pull/8050, but added back because iterators need that information to process lazy loading. * Added lazy-loading k-merge iterator. * Added iterator wrapper as a unified iterator type for image+delta iterator. The current status and test should cover the use case for L0 compaction so that the L0 compaction process can bypass page cache and have a fixed amount of memory usage. The next step is to integrate this with the new bottom-most compaction. 
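As a rough illustration of the k-merge mechanics described above, here is a minimal, self-contained sketch. It deliberately uses plain in-memory runs instead of the real `DeltaLayerInner` / `ImageLayerInner` iterators; the `Run` type, its fields, and the example data are illustrative stand-ins only. The real `MergeIterator` additionally defers opening each layer (lazy loading) by seeding the heap with the layer's start key/LSN as a lower bound, which this sketch does not show.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Stand-in for a layer: a sorted run of (key, lsn, value) tuples.
/// In the real code these are delta/image layer iterators.
struct Run {
    items: Vec<(u32, u64, &'static str)>,
    pos: usize,
}

impl Run {
    fn peek(&self) -> Option<(u32, u64)> {
        self.items.get(self.pos).map(|&(k, l, _)| (k, l))
    }
    fn next(&mut self) -> Option<(u32, u64, &'static str)> {
        let item = self.items.get(self.pos).copied();
        if item.is_some() {
            self.pos += 1;
        }
        item
    }
}

/// K-way merge: a min-heap keyed by each run's next (key, lsn).
fn merge(mut runs: Vec<Run>) -> Vec<(u32, u64, &'static str)> {
    // Heap entries are Reverse(((key, lsn), run index)) so the smallest
    // (key, lsn) across all runs is always at the top of the max-heap.
    let mut heap = BinaryHeap::new();
    for (idx, run) in runs.iter().enumerate() {
        if let Some(key_lsn) = run.peek() {
            heap.push(Reverse((key_lsn, idx)));
        }
    }
    let mut out = Vec::new();
    while let Some(Reverse((_, idx))) = heap.pop() {
        let item = runs[idx].next().expect("peeked entry must exist");
        out.push(item);
        // Re-insert the run with its new head, if any remains.
        if let Some(key_lsn) = runs[idx].peek() {
            heap.push(Reverse((key_lsn, idx)));
        }
    }
    out
}

fn main() {
    let a = Run { items: vec![(1, 10, "a@10"), (1, 20, "a@20"), (5, 10, "e@10")], pos: 0 };
    let b = Run { items: vec![(2, 10, "b@10"), (5, 5, "e@5")], pos: 0 };
    // Prints values in (key, lsn) order across both runs.
    for (k, l, v) in merge(vec![a, b]) {
        println!("key={k} lsn={l} value={v}");
    }
}
```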
--------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant/storage_layer.rs | 3 + .../src/tenant/storage_layer/delta_layer.rs | 30 +- .../src/tenant/storage_layer/image_layer.rs | 10 + .../tenant/storage_layer/merge_iterator.rs | 412 ++++++++++++++++++ 4 files changed, 452 insertions(+), 3 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/merge_iterator.rs diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9607546ce0..62730f88b2 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -7,6 +7,9 @@ pub(crate) mod layer; mod layer_desc; mod layer_name; +#[cfg(test)] +pub mod merge_iterator; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::task_mgr::TaskKind; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 000076d7c0..dfd0196c87 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -223,6 +223,11 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, + #[allow(dead_code)] + layer_key_range: Range, + #[allow(dead_code)] + layer_lsn_range: Range, + max_vectored_read_bytes: Option, } @@ -742,6 +747,16 @@ impl DeltaLayer { } impl DeltaLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.layer_key_range + } + + #[cfg(test)] + pub(crate) fn lsn_range(&self) -> &Range { + &self.layer_lsn_range + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure @@ -790,6 +805,8 @@ impl DeltaLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, max_vectored_read_bytes, + layer_key_range: actual_summary.key_range, + layer_lsn_range: actual_summary.lsn_range, })) } @@ -1639,7 +1656,7 @@ impl<'a> DeltaLayerIterator<'a> { } #[cfg(test)] -mod test { +pub(crate) mod test { use std::collections::BTreeMap; use itertools::MinMaxResult; @@ -2217,13 +2234,20 @@ mod test { } } - async fn produce_delta_layer( + pub(crate) fn sort_delta( + (k1, l1, _): &(Key, Lsn, Value), + (k2, l2, _): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + (k1, l1).cmp(&(k2, l2)) + } + + pub(crate) async fn produce_delta_layer( tenant: &Tenant, tline: &Arc, mut deltas: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result { - deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2))); + deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); let (key_max, _, _) = deltas.first().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad..1e03e1a58c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -369,6 +369,16 @@ impl ImageLayer { } impl ImageLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.key_range + } + + #[cfg(test)] + pub(crate) fn lsn(&self) -> Lsn { + self.lsn + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs 
b/pageserver/src/tenant/storage_layer/merge_iterator.rs new file mode 100644 index 0000000000..36386c87c9 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -0,0 +1,412 @@ +use std::{ + cmp::Ordering, + collections::{binary_heap, BinaryHeap}, +}; + +use pageserver_api::key::Key; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, repository::Value}; + +use super::{ + delta_layer::{DeltaLayerInner, DeltaLayerIterator}, + image_layer::{ImageLayerInner, ImageLayerIterator}, +}; + +#[derive(Clone, Copy)] +enum LayerRef<'a> { + Image(&'a ImageLayerInner), + Delta(&'a DeltaLayerInner), +} + +impl<'a> LayerRef<'a> { + fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { + match self { + Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), + Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), + } + } +} + +enum LayerIterRef<'a> { + Image(ImageLayerIterator<'a>), + Delta(DeltaLayerIterator<'a>), +} + +impl LayerIterRef<'_> { + async fn next(&mut self) -> anyhow::Result> { + match self { + Self::Delta(x) => x.next().await, + Self::Image(x) => x.next().await, + } + } +} + +/// This type plays several roles at once +/// 1. Unified iterator for image and delta layers. +/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). +/// 3. Lazy creation of the real delta/image iterator. +enum IteratorWrapper<'a> { + NotLoaded { + ctx: &'a RequestContext, + first_key_lower_bound: (Key, Lsn), + layer: LayerRef<'a>, + }, + Loaded { + iter: PeekableLayerIterRef<'a>, + }, +} + +struct PeekableLayerIterRef<'a> { + iter: LayerIterRef<'a>, + peeked: Option<(Key, Lsn, Value)>, // None == end +} + +impl<'a> PeekableLayerIterRef<'a> { + async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result { + let peeked = iter.next().await?; + Ok(Self { iter, peeked }) + } + + fn peek(&self) -> &Option<(Key, Lsn, Value)> { + &self.peeked + } + + async fn next(&mut self) -> anyhow::Result> { + let result = self.peeked.take(); + self.peeked = self.iter.next().await?; + Ok(result) + } +} + +impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} + +impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> std::cmp::Ord for IteratorWrapper<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering; + let a = self.peek_next_key_lsn(); + let b = other.peek_next_key_lsn(); + match (a, b) { + (Some((k1, l1)), Some((k2, l2))) => { + let loaded_1 = if self.is_loaded() { 1 } else { 0 }; + let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + // When key_lsn are the same, the unloaded iter will always appear before the loaded one. + // And note that we do a reverse at the end of the comparison, so it works with the max heap. 
+ (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + } + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + } + .reverse() + } +} + +impl<'a> IteratorWrapper<'a> { + pub fn create_from_image_layer( + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Image(image_layer), + first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), + ctx, + } + } + + pub fn create_from_delta_layer( + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Delta(delta_layer), + first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), + ctx, + } + } + + fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + match self { + Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::NotLoaded { + first_key_lower_bound: (key, lsn), + .. + } => Some((key, *lsn)), + } + } + + // CORRECTNESS: this function must always take `&mut self`, never `&self`. + // + // The reason is that `impl Ord for Self` evaluates differently after this function + // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when + // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut` + // and not just `PeekMut::deref` + // If we don't take `&mut self` + async fn load(&mut self) -> anyhow::Result<()> { + assert!(!self.is_loaded()); + let Self::NotLoaded { + ctx, + first_key_lower_bound, + layer, + } = self + else { + unreachable!() + }; + let iter = layer.iter(ctx); + let iter = PeekableLayerIterRef::create(iter).await?; + if let Some((k1, l1, _)) = iter.peek() { + let (k2, l2) = first_key_lower_bound; + debug_assert!((k1, l1) >= (k2, l2)); + } + *self = Self::Loaded { iter }; + Ok(()) + } + + fn is_loaded(&self) -> bool { + matches!(self, Self::Loaded { .. }) + } + + /// Correctness: must load the iterator before using. + /// + /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it. + /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and + /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. + async fn next(&mut self) -> anyhow::Result> { + let Self::Loaded { iter } = self else { + panic!("must load the iterator before using") + }; + iter.next().await + } +} + +pub struct MergeIterator<'a> { + heap: BinaryHeap>, +} + +impl<'a> MergeIterator<'a> { + pub fn create( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + let mut heap = Vec::with_capacity(images.len() + deltas.len()); + for image in images { + heap.push(IteratorWrapper::create_from_image_layer(image, ctx)); + } + for delta in deltas { + heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx)); + } + Self { + heap: BinaryHeap::from(heap), + } + } + + pub async fn next(&mut self) -> anyhow::Result> { + while let Some(mut iter) = self.heap.peek_mut() { + if !iter.is_loaded() { + // Once we load the iterator, we can know the real first key-value pair in the iterator. + // We put it back into the heap so that a potentially unloaded layer may have a key between + // [potential_first_key, loaded_first_key). + iter.load().await?; + continue; + } + let Some(item) = iter.next().await? else { + // If the iterator returns None, we pop this iterator. 
Actually, in the current implementation, + // we order None > Some, and all the rest of the iterators should return None. + binary_heap::PeekMut::pop(iter); + continue; + }; + return Ok(Some(item)); + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + }, + DEFAULT_PG_VERSION, + }; + + async fn assert_merge_iter_equal( + merge_iter: &mut MergeIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = merge_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn merge_in_between() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(5), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = vec![ + ( + get_key(3), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(4), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } + + #[tokio::test] + async fn delta_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_2 = produce_delta_layer(&tenant, 
&tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10 + N as u32), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge + } + + // TODO: image layer merge, delta+image mixed merge + // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that +} From 547acde6cd3f1c062e1bbee775c6778d9e1a26d2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 10 Jul 2024 19:38:14 +0100 Subject: [PATCH 221/412] safekeeper: add eviction_min_resident to stop evictions thrashing (#8335) ## Problem - The condition for eviction is not time-based: it is possible for a timeline to be restored in response to a client, that client times out, and then as soon as the timeline is restored it is immediately evicted again. - There is no delay on eviction at startup of the safekeeper, so when it starts up and sees many idle timelines, it does many evictions which will likely be immediately restored when someone uses the timeline. ## Summary of changes - Add `eviction_min_resident` parameter, and use it in `ready_for_eviction` to avoid evictions if the timeline has been resident for less than this period. - This also implicitly delays evictions at startup for `eviction_min_resident` - Set this to a very low number for the existing eviction test, which expects immediate eviction. The default period is 15 minutes. The general reasoning for that is that in the worst case where we thrash ~10k timelines on one safekeeper, downloading 16MB for each one, we should set a period that would not overwhelm the node's bandwidth. 
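
As a rough back-of-the-envelope illustration of that bound (using the numbers above, not a measurement): thrashing ~10k timelines at 16MB each is ~160GB of restore traffic per cycle, so requiring at least 15 minutes (900s) of residency between cycles keeps the worst-case download rate around 160GB / 900s ≈ 180MB/s.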
--- safekeeper/src/bin/safekeeper.rs | 11 ++++++++-- safekeeper/src/lib.rs | 7 +++++++ safekeeper/src/timeline_eviction.rs | 4 ++++ safekeeper/src/timeline_manager.rs | 5 +++++ .../tests/walproposer_sim/safekeeper.rs | 1 + test_runner/fixtures/neon_fixtures.py | 21 +++++++++++++++++-- test_runner/regress/test_wal_acceptor.py | 21 ++++++++++--------- 7 files changed, 56 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 4d580e57ed..9eb6546d6b 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -27,8 +27,8 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ - DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, - DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; @@ -194,6 +194,12 @@ struct Args { /// Number of allowed concurrent uploads of partial segments to remote storage. #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] partial_backup_concurrency: usize, + /// How long a timeline must be resident before it is eligible for eviction. + /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, + /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, + /// if it weren't for `eviction_min_resident` preventing that. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] + eviction_min_resident: Duration, } // Like PathBufValueParser, but allows empty string. @@ -348,6 +354,7 @@ async fn main() -> anyhow::Result<()> { delete_offloaded_wal: args.delete_offloaded_wal, control_file_save_interval: args.control_file_save_interval, partial_backup_concurrency: args.partial_backup_concurrency, + eviction_min_resident: args.eviction_min_resident, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 5cd676d857..af83feb77f 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -53,6 +53,11 @@ pub mod defaults { pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + + // By default, our required residency before eviction is the same as the period that passes + // before uploading a partial segment, so that in normal operation the eviction can happen + // as soon as we have done the partial segment upload. 
+ pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; } #[derive(Debug, Clone)] @@ -93,6 +98,7 @@ pub struct SafeKeeperConf { pub delete_offloaded_wal: bool, pub control_file_save_interval: Duration, pub partial_backup_concurrency: usize, + pub eviction_min_resident: Duration, } impl SafeKeeperConf { @@ -136,6 +142,7 @@ impl SafeKeeperConf { delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, } } } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index b303d41b7b..e4ab65290d 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,6 +5,7 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; +use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -48,6 +49,7 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 + && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -91,6 +93,8 @@ impl Manager { return; } + self.resident_since = Instant::now(); + info!("successfully restored evicted timeline"); } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 62142162de..debf8c824f 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -186,6 +186,10 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, pub(crate) partial_backup_rate_limiter: RateLimiter, + + // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not + // evict them if they go inactive very soon after being restored. 
+ pub(crate) resident_since: std::time::Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -350,6 +354,7 @@ impl Manager { access_service: AccessService::new(manager_tx), tli, partial_backup_rate_limiter, + resident_since: std::time::Instant::now(), } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 6bbf96d71d..0c6d97ddfa 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -188,6 +188,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cae2e422c1..5ca31644a9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -492,6 +492,7 @@ class NeonEnvBuilder: pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, + safekeeper_extra_opts: Optional[list[str]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -557,6 +558,8 @@ class NeonEnvBuilder: self.pageserver_aux_file_policy = pageserver_aux_file_policy + self.safekeeper_extra_opts = safekeeper_extra_opts + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1193,7 +1196,9 @@ class NeonEnv: sk_cfg[ "remote_storage" ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() - self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + self.safekeepers.append( + Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) + ) cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") @@ -4016,16 +4021,28 @@ class Safekeeper(LogUtils): id: int running: bool = False - def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + def __init__( + self, + env: NeonEnv, + port: SafekeeperPort, + id: int, + running: bool = False, + extra_opts: Optional[List[str]] = None, + ): self.env = env self.port = port self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + self.extra_opts = extra_opts def start( self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None ) -> "Safekeeper": + if extra_opts is None: + # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. 
+ extra_opts = self.extra_opts + assert self.running is False self.env.neon_cli.safekeeper_start( self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index febfc10293..7efd86e349 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2191,24 +2191,25 @@ def test_s3_eviction( ): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_timeout": "100ms", - } - ) - extra_opts = [ + neon_env_builder.safekeeper_extra_opts = [ "--enable-offload", "--partial-backup-timeout", "50ms", "--control-file-save-interval", "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. + "--eviction-min-resident=100ms", ] if delete_offloaded_wal: - extra_opts.append("--delete-offloaded-wal") + neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") - for sk in env.safekeepers: - sk.stop().start(extra_opts=extra_opts) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_timeout": "100ms", + } + ) n_timelines = 5 @@ -2263,7 +2264,7 @@ def test_s3_eviction( # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: - sk.stop().start(extra_opts=extra_opts) + sk.stop().start() time.sleep(0.5) # require at least one successful eviction in at least one safekeeper From a91f9d5832ff75fc39b95aef20fdcc945394172d Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 11 Jul 2024 10:20:14 +0300 Subject: [PATCH 222/412] Enable core dumps for postgres (#8272) Set core rmilit to ulimited in compute_ctl, so that all child processes inherit it. We could also set rlimit in relevant startup script, but that way we would depend on external setup and might inadvertently disable it again (core dumping worked in pods, but not in VMs with inittab-based startup). 
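
A minimal standalone sketch of the inheritance behaviour (illustration only, not part of the patch; it assumes a Unix shell is available to the child process):

```rust
use rlimit::{setrlimit, Resource};
use std::process::Command;

fn main() -> std::io::Result<()> {
    // Raise the core-dump limit in the parent; rlimits are inherited across fork/exec.
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;

    // A child process should now report the raised limit, e.g. "unlimited".
    let out = Command::new("sh").arg("-c").arg("ulimit -c").output()?;
    println!("child core limit: {}", String::from_utf8_lossy(&out.stdout).trim());
    Ok(())
}
```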
--- Cargo.lock | 10 ++++++++++ compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 4 ++++ compute_tools/src/compute.rs | 2 +- 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 776d95c3c7..9fb3f5385d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1236,6 +1236,7 @@ dependencies = [ "regex", "remote_storage", "reqwest 0.12.4", + "rlimit", "rust-ini", "serde", "serde_json", @@ -4901,6 +4902,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rlimit" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" +dependencies = [ + "libc", +] + [[package]] name = "routerify" version = "3.0.0" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8f96530a9d..8ceb8f2ad2 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" rust-ini = "0.20.0" +rlimit = "0.10.1" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7bf5db5a57..f4c396a85d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -64,6 +64,7 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; +use rlimit::{setrlimit, Resource}; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var @@ -72,6 +73,9 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { let (build_tag, clap_args) = init()?; + // enable core dumping for all child processes + setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; + let (pg_handle, start_pg_result) = { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index eced6fc0b2..1112795d30 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1116,7 +1116,7 @@ impl ComputeNode { // EKS worker nodes have following core dump settings: // /proc/sys/kernel/core_pattern -> core // /proc/sys/kernel/core_uses_pid -> 1 - // ulimint -c -> unlimited + // ulimit -c -> unlimited // which results in core dumps being written to postgres data directory as core.. // // Use that as a default location and pattern, except macos where core dumps are written From 32f668f5e7fa1855d7b1c5cf3773203da61ba110 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 08:23:51 +0100 Subject: [PATCH 223/412] rfcs: add RFC for timeline archival (#8221) A design for a cheap low-resource state for idle timelines: - #8088 --- docs/rfcs/034-timeline-archive.md | 507 ++++++++++++++++++++++++++++++ 1 file changed, 507 insertions(+) create mode 100644 docs/rfcs/034-timeline-archive.md diff --git a/docs/rfcs/034-timeline-archive.md b/docs/rfcs/034-timeline-archive.md new file mode 100644 index 0000000000..c834216962 --- /dev/null +++ b/docs/rfcs/034-timeline-archive.md @@ -0,0 +1,507 @@ +# Timeline Archival + +## Summary + +This RFC describes a mechanism for pageservers to eliminate local storage + compute work +for timelines which are not in use, in response to external API calls to "archive" a timeline. 
+ +The archived state roughly corresponds to fully offloading a timeline to object storage, such +that its cost is purely the cost of that object storage. + +## Motivation + +Archived timelines serve multiple purposes: +- Act as a 'snapshot' for workloads that would like to retain restorable copies of their + database from longer ago than their PITR window. +- Enable users to create huge numbers of branches (e.g. one per github PR) without having + to diligently clean them up later to avoid overloading the pageserver (currently we support + up to ~500 branches per tenant). + +### Prior art + +Most storage and database systems have some form of snapshot, which can be implemented several ways: +1. full copies of data (e.g. an EBS snapshot to S3) +2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS. +3. a series of snapshots which are CoW or de-duplicated relative to one another. + +Today's Neon branches are approximately like `2.`, although due to implementation details branches +often end up storing much more data than they really need, as parent branches assume that all data +at the branch point is needed. The layers pinned in the parent branch may have a much larger size +than the physical size of a compressed image layer representing the data at the branch point. + +## Requirements + +- Enter & exit the archived state in response to external admin API calls +- API calls to modify the archived state are atomic and durable +- An archived timeline should eventually (once out of PITR window) use an efficient compressed + representation, and avoid retaining arbitrarily large data in its parent branch. +- Remote object GETs during tenant start may be O(N) with the number of _active_ branches, + but must not scale with the number of _archived_ branches. +- Background I/O for archived branches should only be done a limited number of times to evolve them + to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping" + overhead for archived branches, including operations related to calculating sizes for billing. +- The pageserver should put no load on the safekeeper for archived branches. +- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch + to a performant state in a short time (linear with the branch's logical size) + +## Non Goals + +- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored + in Neon's internal format. +- Compute cold starts after activating an archived branch will not have comparable performance to + cold starts on an active branch. +- Archived branches will not use any new/additional compression or de-duplication beyond what + is already implemented for image layers (zstd per page). +- The pageserver will not "auto start" archived branches in response to page_service API requests: they + are only activated explicitly via the HTTP API. +- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will + remain on local disk, although existing eviction mechanisms will remove any segments from local disk. +- We will not expose any prometheus metrics for archived timelines, or make them visible in any + detailed HTTP APIs other than the specific API for listing archived timelines. +- A parent branch may not be archived unless all its children are. 
+ +## Impacted Components + +pageserver, storage controller + +## Terminology + +**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller +may assume that this branch is now very cheap to store, although this may not be physically so until the +branch proceeds to the offloaded state. + +**Active** branches are branches which are available for use by page_service clients, and have a relatively +high cost due to consuming local storage. + +**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such +that they now consume minimal runtime resources and have a cost similar to the cost of object storage. + +**Activate** (verb): transition from Archived to Active + +**Archive** (verb): transition from Active to Archived + +**Offload** (verb): transition from Archived to Offloaded + +**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load. + +**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is +warmed up, good performance will be available to page_service clients. + +## Implementation + +### High level flow + +We may think of a timeline which is archived and then activated as proceeding through a series of states: + +```mermaid +stateDiagram + [*] --> Active(warm) + Active(warm) --> Archived + Archived --> Offloaded + Archived --> Active(warm) + Offloaded --> Active(cold) + Active(cold) --> Active(warm) +``` + +Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles +of branches will be: +- Very frequent: Short lived branches: Active -> Deleted +- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted +- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active + +These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination +of: +- the timeline's lifecycle state: active or archived, stored in the timeline's index +- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the + manifest of offloaded timelines. +- cache state (whether it's warm or cold). + +### Storage format changes + +There are two storage format changes: +1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to + be considered active or archived. +2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load + at startup (and is available for storing other small, rarely changing tenant-wide attributes in future) + +The manifest object will have a format like this: +``` +{ + "offload_timelines": [ + { + "timeline_id": ... + "last_record_lsn": ... + "last_record_lsn_time": ... + "pitr_interval": ... + "last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot) + "logical_size": ... # The size at last_record_lsn + "physical_size" ... + "parent": Option<{ + "timeline_id"... + "lsn"... 
# Branch point LSN on the parent + "requires_data": bool # True if this branch depends on layers in its parent, identify it here + + }> + } + ] +} +``` + +The information about a timeline in its offload state is intentionally minimal: just enough to decide: +- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this + by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn. +- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing + layers that the archived branch depends on +- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request + is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then + we don't need to go to S3 for the deletion. +- How much archived space to report in consumption metrics + +The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total +set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded` +(offloaded timelines). + +For split-brain protection, the manifest object will be written with a generation suffix, in the same way as +index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but +give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code +for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover +the manifest file. + +### API & Timeline state + +Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will +be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which +may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval +a per-timeline configuration). + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure` +``` +{ + 'state': 'active|archive' +} +``` + +When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded. + +When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part, +**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's +index, but not any data: it should be about as fast as a couple of small S3 requests. + +The API will be available with identical path via the storage controller: calling this on a sharded tenant +will simply map the API call to all the shards. + +Archived timelines may never have descendent timelines which are active. This will be enforced at the API level, +such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires +that all its descendents are archived. It is the callers responsibility to walk the hierarchy of timelines +in the proper order if they would like to archive whole trees of branches. + +Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically +for archived timelines will be added: this is for use in support/debug: + +``` +GET /v1/tenants/{tenant_id}/archived_timelines + +{ + ...same per-timeline content as the tenant manifest... 
+} + +``` + +### Tenant attach changes + +Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline +we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived +timelines, we must have a single object that tells us which timelines do not need to be loaded. The +number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic +because each request covers 1000 timelines. + +This is **not** literally the same as the set of timelines who have state=archived. Rather, it is +the set of timelines which have been offloaded in the background after their state was set to archived. + +We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't +exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need +to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying +to delete an offloaded timeline. + +### Warm-up API + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234` + +This API will be similar to the existing `download_remote_layers` API, but smarter: +- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read) +- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress + of downloads, so that the caller can poll. + +The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set +of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers +can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache +eviction and heatmaps, as well as in this specific case of warming up a timeline. + +The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised +to call it, because otherwise populating local contents for a timeline can take a long time when waiting +for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite +volatile. + +### Background work + +Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters +an archived branch, it will consider rewriting the branch to just image layers if the branch has no history +([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk +if its state permits that. + +Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider +optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR +has elapsed and it can now be rewritten to image layers. + +#### Archive branch offload + +Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do +any actual work. + +This work is done in the background compaction loop. It makes sense to tag this work on to the compaction +loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency. + +The condition for offload is simple: + - a `Timeline` object exists with state `Archived` + - the timeline does not have any non-offloaded children. 
+ + Regarding the condition that children must be offloaded, this will always be eventually true, because + we enforce at the API level that children of archived timelines must themselves be archived, and all + archived timelines will eventually be offloaded. + +Offloading a timeline is simple: +- Read the timeline's attributes that we will store in its offloaded state (especially its logical size) +- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it) +- Erase all the timeline's content from local storage (`remove_dir_all` on its path) +- Write the tenant manifest to S3 to prevent this timeline being loaded on next start. + +#### Archive branch optimization (flattening) + +When we offloaded a branch, it might have had some history that prevented rewriting it to a single +point in time set of image layers. For example, a branch might have several days of writes and a 7 +day PITR: when we archive it, it still has those days of history. + +Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by: +- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing + a point in time compared with delta layers +- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor + for data, i.e. the ancestor is free to GC layers files at+below the branch point + +Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the +branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes +a true snapshot at that LSN. + +It is not always more efficient to flatten a branch than to keep some extra history on the parent: this +is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper) + +Archive branch optimization should be done _before_ background offloads during compaction, because there may +be timelines which are ready to be offloaded but also would benefit from the optimization step before +being offloaded. For example, a branch which has already fallen out of PITR window and has no history +of its own may be immediately re-written as a series of image layers before being offloaded. + +### Consumption metrics + +Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating +that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived +vs. ordinary content. + +Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size` +variant of `MetricsKey`: receivers are then free to bill on this metric as they please. + +### Secondary locations + +Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby +when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents +will be dropped from secondary locations. + +### Sharding + +Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in +the same way that timeline creation and deletion is done. There are no special rules about ordering: +the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline. 
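+
+As a rough illustration only (not part of the required changes), the tenant manifest written during offload
+could be modelled with serde types that mirror the JSON sketch under "Storage format changes". The type names
+are made up for illustration, and plain strings stand in for ID/LSN/duration types to keep the sketch
+self-contained:
+
+```rust
+use serde::{Deserialize, Serialize};
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantManifest {
+    /// Timelines the pageserver skips at startup because they are offloaded.
+    pub offload_timelines: Vec<OffloadedTimelineManifest>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct OffloadedTimelineManifest {
+    pub timeline_id: String,
+    pub last_record_lsn: String,
+    pub last_record_lsn_time: String,
+    pub pitr_interval: String,
+    /// Equal to last_record_lsn if the branch has no history (i.e. a snapshot).
+    pub last_gc_lsn: String,
+    pub logical_size: u64,
+    pub physical_size: u64,
+    /// None for timelines with no ancestor.
+    pub parent: Option<ParentInfo>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ParentInfo {
+    pub timeline_id: String,
+    /// Branch point LSN on the parent.
+    pub lsn: String,
+    /// True if this branch still depends on layers in its parent.
+    pub requires_data: bool,
+}
+```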
+ +Since consumption metrics are only transmitted from shard zero, the state of archival on this shard +will be authoritative for consumption metrics. + +## Error cases + +### Errors in sharded tenants + +If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed +state, where a timeline is archived on some shards but not on others. + +We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline +are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest). +In the transient case callers are expected to retry until success, or to make appropriate API calls to clear +up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent +state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't +break anything, it's just "weird". + +This is similar to the status quo for timeline creation and deletion: callers are expected to retry +these operations until they succeed. + +### Archiving/activating + +Archiving/activating a timeline can fail in a limited number of ways: +1. I/O error storing/reading the timeline's updated index + - These errors are always retryable: a fundamental design assumption of the pageserver is that remote + storage errors are always transient. +2. NotFound if the timeline doesn't exist + - Callers of the API are expected to avoid calling deletion and archival APIs concurrently. + - The storage controller has runtime locking to prevent races such as deleting a timeline while + archiving it. +3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated + - Callers are expected to do their own checks to avoid hitting this case. If they make + a mistake and encounter this error, they should give up. + +### Offloading + +Offloading can only fail if remote storage is unavailable, which would prevent us from writing the +tenant manifest. In such error cases, we give up in the expectation that offloading will be tried +again at the next iteration of the compaction loop. + +### Archive branch optimization + +Optimization is a special form of compaction, so can encounter all the same errors as regular compaction +can: it should return Result<(), CompactionError>, and as with compaction it will be retried on +the next iteration of the compaction loop. + +## Optimizations + +### Delaying storage optimization if retaining parent layers is cheaper + +Optimizing archived branches to image layers and thereby enabling parent branch GC to progress +is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they +are offloaded to S3 they're totally safe, inert things. + +However, in some cases it can be advantageous to retain extra history on their parent branch rather +than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB +of data per day), and archive branches are being created nightly, then writing out full 1TB image layers +for each nightly branch is inefficient compared with just keeping more history on the main branch. + +Getting this right requires consideration of: +- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to + write out extra image layers, then it might make more sense to just write out the image layers on + the archived branch. 
+- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes + the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely + large layer map can cause problems elsewhere. + +This optimization can probably be implemented quite cheaply with some basic heuristics like: +- don't bother doing optimization on an archive branch if the LSN distance between + its branch point and the end of the PITR window is <5% of the logical size of the archive branch. +- ...but, Don't keep more history on the main branch than double the PITR + +### Creating a timeline in archived state (a snapshot) + +Sometimes, one might want to create a branch with no history, which will not be written to +before it is archived. This is a snapshot, although we do not require a special snapshot API, +since a snapshot can be represented as a timeline with no history. + +This can be accomplished by simply creating a timeline and then immediately archiving it, but +that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage +broker to try and ingest WAL, before being shutdown in the subsequent archival call. To explicitly +support this common special case, we may add a parameter to the timeline creation API which +creates a timeline directly into the archived state. + +Such a timeline creation will do exactly two I/Os at creation time: +- write the index_part object to record the timeline's existence +- when the timeline is offloaded in the next iteration of the compaction loop (~20s later), + write the tenant manifest. + +Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake +up the 'snapshot' branch and write out image layers. + +## Future Work + +### Enabling `fullbackup` dumps from archive branches + +It would be useful to be able to export an archive branch to another system, or for use in a local +postgres database. + +This could be implemented as a general capability for all branches, in which case it would "just work" +for archive branches by activating them. However, downloading all the layers in a branch just to generate +a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches +which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk. + +Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem +is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup +stream to S3 in an intermediate format and, then having one node stitch them together). + +### Tagging layers from archived branches + +When we know a layer is an image layer written for an archived branch that has fallen off the PITR window, +we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even +cheaper storage. + +This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver +external hints on which branches are likely to be reactivated, and which branches are good candidates for +tagging for low performance storage. + +Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object +stores have similar mechanisms. 
+ +### Storing sequences of archive branches as deltas + +When archived branches are used as scheduled snapshots, we could store them even more efficiently +by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the +storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified +pages). This is the kind of encoding that many backup storage systems use. + +The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding +vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full +copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds, +so the complexity tradeoff of diff-encoding it is dubious). + +One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the +pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that +we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch, +so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's +delta snapshot". + +Clearly this all requires careful housekeeping to retain the relationship between branches that depend on +each other: perhaps this would be done by making the archive branches have child/parent relationships with +each other, or perhaps we would permit them to remain children of their original parent, but additionally +have a relationship with the snapshot they're encoded relative to. + +Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring +out how frequently to write a full copy is important. This is essentially a zoomed-out version of what +we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline. + + +## FAQ/Alternatives + +### Store all timelines in the tenant manifest + +Rather than special-casing offloaded timelines in the offload manifest, we could store a total +manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on +startup. + +That would be a more invasive change (require hooking in to timeline creation), and would +generate much more I/O to this manifest for tenants that had many branches _and_ frequent +create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines +means that we only have to cope with the rate at which long-lived timelines are archived, rather +than the rate at which sort lived timelines are created & destroyed. + +### Automatically archiving/activating timelines without external API calls + +We could implement TTL driven offload of timelines, waking them up when a page request +arrives. + +This has downsides: +- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't + know which of their branches are in this state, and might get a surprise when they try + to use such a branch. +- Price fluctuation: if the archival of a branch is used in end user pricing, then users + prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it + is created, rather than having a usage-dependency storage price. +- Complexity: enabling the page service to call up into the Tenant to activate a timeline + would be awkward, compared with an external entry point. 
+ +### Make offloaded a state of Timeline + +To reduce the operator-facing complexity of having some timelines APIs that only return +non-offloaded timelines, we could build the offloaded state into the Timeline type. + +`timeline.rs` is already one of the most egregiously long source files in the tree, so +this is rejected on the basis that we need to avoid making that complexity worse. \ No newline at end of file From 6caf70241783e7324d38453f919c78aa4f7f6faa Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 11 Jul 2024 11:07:12 +0200 Subject: [PATCH 224/412] Run Performance bench on more platforms (#8312) ## Problem https://github.com/neondatabase/cloud/issues/14721 ## Summary of changes add one more platform to benchmarking job https://github.com/neondatabase/neon/blob/57535c039c938f7c179693d9db8b052912019823/.github/workflows/benchmarking.yml#L57C3-L126 Run with pg 16, provisioner k8-neonvm by default on the new platform. Adjust some test cases to - not depend on database client <-> database server latency by pushing loops into server side pl/pgSQL functions - increase statement and test timeouts First successful run of these job steps https://github.com/neondatabase/neon/actions/runs/9869817756/job/27254280428 --- .github/workflows/benchmarking.yml | 21 +++++++++++++---- test_runner/performance/test_hot_page.py | 28 +++++++++++++++++------ test_runner/performance/test_hot_table.py | 21 +++++++++++++---- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 899cae2b86..d038f64f15 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,15 +56,26 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + strategy: + matrix: + include: + - DEFAULT_PG_VERSION: 14 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + provisioner: 'k8s-pod' + - DEFAULT_PG_VERSION: 16 + PLATFORM: "azure-staging" + region_id: 'azure-eastus2' + provisioner: 'k8s-neonvm' env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-staging" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -85,9 +96,10 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + provisioner: ${{ matrix.provisioner }} - name: Run benchmark uses: ./.github/actions/run-python-test-set @@ -96,13 +108,14 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} + pg_version: ${{ env.DEFAULT_PG_VERSION }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests extra_params: -m remote_cluster --sparse-ordering - --timeout 5400 + --timeout 14400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py --ignore test_runner/performance/test_logical_replication.py diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d9785dd87e..5e97c7cddf 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -16,20 +16,34 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance - num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("drop table if exists t, f;") + num_writes = 1000000 - # Write many updates to the same row + # Use a PL/pgSQL block to perform many updates to the same row + # without depending on the latency between database client and postgres + # server + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") with env.record_duration("write"): - cur.execute("create table t (i integer);") - cur.execute("insert into t values (0);") - for i in range(num_writes): - cur.execute(f"update t set i = {i};") + cur.execute( + f""" + DO $$ + BEGIN + create table t (i integer); + insert into t values (0); - # Write 3-4 MB to evict t from compute cache + FOR j IN 1..{num_writes} LOOP + update t set i = j; + END LOOP; + END $$; + """ + ) + + # Write ca 350 MB to evict t from compute shared buffers (128 MB) + # however it will still be in LFC, so I do not really understand the point of this test cur.execute("create table f (i integer);") cur.execute("insert into f values (generate_series(1,100000));") diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 5fcffc8afb..9a78c92ec0 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -16,8 +16,8 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_table(env: PgCompare): # Update a small table many times, then measure read performance - num_rows = 100000 # Slightly larger than shared buffers size TODO validate - num_writes = 1000000 + num_rows = 100000 # initial table size only about 4 MB + num_writes = 10000000 # write approximately 349 MB blocks > 128 MB shared_buffers num_reads = 10 with closing(env.pg.connect()) as conn: @@ -28,8 +28,21 @@ def test_hot_table(env: PgCompare): with env.record_duration("write"): cur.execute("create table t (i integer primary key);") cur.execute(f"insert into t values (generate_series(1,{num_rows}));") - for i in range(num_writes): - cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") + # PL/pgSQL block to perform updates (and avoid latency between client and server) + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") + cur.execute( + f""" + DO $$ + DECLARE + r integer := {num_rows}; + BEGIN + FOR j IN 1..{num_writes} LOOP + UPDATE t SET i = j + r WHERE i = j; + END LOOP; + END $$; + """ + ) # Read the table with env.record_duration("read"): From 5bba3e3c75d0840c50023eaf2be2d9a483f2a020 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 11 Jul 2024 15:17:07 +0200 Subject: [PATCH 225/412] pageserver: remove `trace_read_requests` (#8338) `trace_read_requests` 
is a per `Tenant`-object option. But the `handle_pagerequests` loop doesn't know which `Tenant` object (i.e., which shard) the request is for. The remaining use of the `Tenant` object is to check `tenant.cancel`. That check is incorrect [if the pageserver hosts multiple shards](https://github.com/neondatabase/neon/issues/7427#issuecomment-2220577518). I'll fix that in a future PR where I completely eliminate the holding of `Tenant/Timeline` objects across requests. See [my code RFC](https://github.com/neondatabase/neon/pull/8286) for the high level idea. Note that we can always bring the tracing functionality if we need it. But since it's actually about logging the `page_service` wire bytes, it should be a `page_service`-level config option, not per-Tenant. And for enabling tracing on a single connection, we can implement a `set pageserver_trace_connection;` option. --- Cargo.lock | 11 -- Cargo.toml | 1 - control_plane/src/pageserver.rs | 10 -- libs/pageserver_api/src/models.rs | 1 - libs/utils/src/id.rs | 11 -- pageserver/src/config.rs | 45 ----- pageserver/src/http/openapi_spec.yml | 2 - pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 19 -- pageserver/src/tenant.rs | 8 - pageserver/src/tenant/config.rs | 10 -- pageserver/src/trace.rs | 36 ---- test_runner/fixtures/compare_fixtures.py | 2 - .../regress/test_attach_tenant_config.py | 1 - test_runner/regress/test_read_trace.py | 39 ---- trace/Cargo.toml | 13 -- trace/src/main.rs | 167 ------------------ 17 files changed, 377 deletions(-) delete mode 100644 pageserver/src/trace.rs delete mode 100644 test_runner/regress/test_read_trace.py delete mode 100644 trace/Cargo.toml delete mode 100644 trace/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 9fb3f5385d..4b1525edee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6510,17 +6510,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" -[[package]] -name = "trace" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "pageserver_api", - "utils", - "workspace_hack", -] - [[package]] name = "tracing" version = "0.1.37" diff --git a/Cargo.toml b/Cargo.toml index fc3dd51809..6bad8e3b20 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ members = [ "storage_controller", "storage_scrubber", "workspace_hack", - "trace", "libs/compute_api", "libs/pageserver_api", "libs/postgres_ffi", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index f0403b1796..5f2373e95a 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -349,11 +349,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) @@ -454,11 +449,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d360cc6e87..6abdcb88d0 100644 --- 
a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -294,7 +294,6 @@ pub struct TenantConfig { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 0409001f4f..db468e3054 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -302,17 +302,6 @@ pub struct TenantId(Id); id_newtype!(TenantId); -/// Neon Connection Id identifies long-lived connections (for example a pagestream -/// connection with the page_service). Is used for better logging and tracing -/// -/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look -/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`Id`] for alternative ways to serialize it. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ConnectionId(Id); - -id_newtype!(ConnectionId); - // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 17bc427b2c..5b103b551f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -12,7 +12,6 @@ use serde::de::IntoDeserializer; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; -use utils::id::ConnectionId; use utils::logging::SecretString; use once_cell::sync::OnceCell; @@ -870,22 +869,6 @@ impl PageServerConf { ) } - pub fn traces_path(&self) -> Utf8PathBuf { - self.workdir.join("traces") - } - - pub fn trace_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - connection_id: &ConnectionId, - ) -> Utf8PathBuf { - self.traces_path() - .join(tenant_shard_id.to_string()) - .join(timeline_id.to_string()) - .join(connection_id.to_string()) - } - /// Turns storage remote path of a file into its local path. 
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -1560,34 +1543,6 @@ broker_endpoint = '{broker_endpoint}' Ok(()) } - #[test] - fn parse_tenant_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let broker_endpoint = "http://127.0.0.1:7777"; - let trace_read_requests = true; - - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -[tenant_config] -trace_read_requests = {trace_read_requests}"#, - ); - - let toml = config_string.parse()?; - - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; - assert_eq!( - conf.default_tenant_conf.trace_read_requests, trace_read_requests, - "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants", - ); - - Ok(()) - } - #[test] fn parse_incorrect_tenant_config() -> anyhow::Result<()> { let config_string = r#" diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 5ba329f05e..ae109ec1e7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -873,8 +873,6 @@ components: type: string max_lsn_wal_lag: type: integer - trace_read_requests: - type: boolean heatmap_period: type: string TenantConfigResponse: diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ac6b9b4f2a..63c677574f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -23,7 +23,6 @@ pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; -pub mod trace; pub mod utilization; pub mod virtual_file; pub mod walingest; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c10c2f2a0f..f94b0d335e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -36,7 +36,6 @@ use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::ConnectionId; use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, @@ -66,7 +65,6 @@ use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Tenant; use crate::tenant::Timeline; -use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; @@ -430,18 +428,6 @@ impl PageServerHandler { .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) .await?; - // Make request tracer if needed - let mut tracer = if tenant.get_trace_read_requests() { - let connection_id = ConnectionId::generate(); - let path = - tenant - .conf - .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); - Some(Tracer::new(path)) - } else { - None - }; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; self.flush_cancellable(pgb, &tenant.cancel).await?; @@ -473,11 +459,6 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); - // Trace request if needed - if let Some(t) = tracer.as_mut() { - t.trace(©_data_bytes) - } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eef8dc104c..bf23513527 100644 --- a/pageserver/src/tenant.rs +++ 
b/pageserver/src/tenant.rs @@ -2341,13 +2341,6 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } - pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); - tenant_conf - .trace_read_requests - .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) - } - pub fn get_min_resident_size_override(&self) -> Option { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -3718,7 +3711,6 @@ pub(crate) mod harness { walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, evictions_low_residence_duration_metric_threshold: Some( diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5b532e4830..48ff17db94 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -335,7 +335,6 @@ pub struct TenantConf { /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, /// to avoid eager reconnects. pub max_lsn_wal_lag: NonZeroU64, - pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, pub min_resident_size_override: Option, // See the corresponding metric's help string. @@ -436,10 +435,6 @@ pub struct TenantConfOpt { #[serde(default)] pub max_lsn_wal_lag: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub trace_read_requests: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, @@ -519,9 +514,6 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - trace_read_requests: self - .trace_read_requests - .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), min_resident_size_override: self .min_resident_size_override @@ -581,7 +573,6 @@ impl Default for TenantConf { .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), - trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: humantime::parse_duration( @@ -659,7 +650,6 @@ impl From for models::TenantConfig { walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), max_lsn_wal_lag: value.max_lsn_wal_lag, - trace_read_requests: value.trace_read_requests, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs deleted file mode 100644 index 18ec269198..0000000000 --- a/pageserver/src/trace.rs +++ /dev/null @@ -1,36 +0,0 @@ -use bytes::Bytes; -use camino::Utf8PathBuf; -use std::{ - fs::{create_dir_all, File}, - io::{BufWriter, Write}, -}; - -pub struct Tracer { - writer: BufWriter, -} - -impl Drop for Tracer { - fn drop(&mut self) { - self.flush() - } -} - -impl Tracer { - pub 
fn new(path: Utf8PathBuf) -> Self { - let parent = path.parent().expect("failed to parse parent path"); - create_dir_all(parent).expect("failed to create trace dir"); - - let file = File::create(path).expect("failed to create trace file"); - Tracer { - writer: BufWriter::new(file), - } - } - - pub fn trace(&mut self, msg: &Bytes) { - self.writer.write_all(msg).expect("failed to write trace"); - } - - pub fn flush(&mut self) { - self.writer.flush().expect("failed to flush trace file"); - } -} diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 429b6af548..08215438e1 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -109,8 +109,6 @@ class NeonCompare(PgCompare): # Create tenant tenant_conf: Dict[str, str] = {} - if False: # TODO add pytest setting for this - tenant_conf["trace_read_requests"] = "true" self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) # Create timeline diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index f2ee2b70aa..a7eda73d4c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -168,7 +168,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "refill_amount": 1000, "max": 1000, }, - "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, "switch_aux_file_policy": "cross-validation", diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py deleted file mode 100644 index cc5853b727..0000000000 --- a/test_runner/regress/test_read_trace.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import closing - -from fixtures.common_types import Lsn -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.utils import query_scalar - - -# This test demonstrates how to collect a read trace. It's useful until -# it gets replaced by a test that actually does stuff with the trace. -# -# Additionally, tests that pageserver is able to create tenants with custom configs. 
-def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 - env = neon_env_builder.init_start( - initial_tenant_conf={ - "trace_read_requests": "true", - } - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - endpoint = env.endpoints.create_start("main") - - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("create table t (i integer);") - cur.execute(f"insert into t values (generate_series(1,{10000}));") - cur.execute("select count(*) from t;") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data - pageserver_http = env.pageserver.http_client() - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - - # Stop postgres so we drop the connection and flush the traces - endpoint.stop() - - trace_path = env.pageserver.workdir / "traces" / str(tenant_id) / str(timeline_id) - assert trace_path.exists() diff --git a/trace/Cargo.toml b/trace/Cargo.toml deleted file mode 100644 index d6eed3f49c..0000000000 --- a/trace/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "trace" -version = "0.1.0" -edition.workspace = true -license.workspace = true - -[dependencies] -clap.workspace = true -anyhow.workspace = true - -pageserver_api.workspace = true -utils.workspace = true -workspace_hack.workspace = true diff --git a/trace/src/main.rs b/trace/src/main.rs deleted file mode 100644 index 79e1df988d..0000000000 --- a/trace/src/main.rs +++ /dev/null @@ -1,167 +0,0 @@ -//! A tool for working with read traces generated by the pageserver. -use std::collections::HashMap; -use std::path::PathBuf; -use std::str::FromStr; -use std::{ - fs::{read_dir, File}, - io::BufReader, -}; - -use pageserver_api::models::{ - PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, -}; -use utils::id::{ConnectionId, TenantId, TimelineId}; - -use clap::{Parser, Subcommand}; - -/// Utils for working with pageserver read traces. For generating -/// traces, see the `trace_read_requests` tenant config option. -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Path of trace directory - #[arg(short, long)] - path: PathBuf, - - #[command(subcommand)] - command: Command, -} - -/// What to do with the read trace -#[derive(Subcommand, Debug)] -enum Command { - /// List traces in the directory - List, - - /// Print the traces in text format - Dump, - - /// Print stats and anomalies about the traces - Analyze, -} - -// HACK This function will change and improve as we see what kind of analysis is useful. -// Currently it collects the difference in blkno of consecutive GetPage requests, -// and counts the frequency of each value. 
This information is useful in order to: -// - see how sequential a workload is by seeing how often the delta is 1 -// - detect any prefetching anomalies by looking for negative deltas during seqscan -fn analyze_trace(mut reader: R) { - let mut total = 0; // Total requests traced - let mut cross_rel = 0; // Requests that ask for different rel than previous request - let mut deltas = HashMap::::new(); // Consecutive blkno differences - let mut prev: Option = None; - - // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - match msg { - PagestreamFeMessage::Exists(_) => {} - PagestreamFeMessage::Nblocks(_) => {} - PagestreamFeMessage::GetSlruSegment(_) => {} - PagestreamFeMessage::GetPage(req) => { - total += 1; - - if let Some(prev) = prev { - if prev.rel == req.rel { - let delta = (req.blkno as i32) - (prev.blkno as i32); - deltas.entry(delta).and_modify(|c| *c += 1).or_insert(1); - } else { - cross_rel += 1; - } - } - prev = Some(req); - } - PagestreamFeMessage::DbSize(_) => {} - }; - } - - // Print stats. - let mut other = deltas.len(); - deltas.retain(|_, count| *count > 300); - other -= deltas.len(); - dbg!(total); - dbg!(cross_rel); - dbg!(other); - dbg!(deltas); -} - -fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - println!("{msg:?}"); - } -} - -#[derive(Debug)] -struct TraceFile { - #[allow(dead_code)] - pub tenant_id: TenantId, - - #[allow(dead_code)] - pub timeline_id: TimelineId, - - #[allow(dead_code)] - pub connection_id: ConnectionId, - - pub path: PathBuf, -} - -fn get_trace_files(traces_dir: &PathBuf) -> anyhow::Result> { - let mut trace_files = Vec::::new(); - - // Trace files are organized as {tenant_id}/{timeline_id}/{connection_id} - for tenant_dir in read_dir(traces_dir)? { - let entry = tenant_dir?; - let path = entry.path(); - let tenant_id = TenantId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for timeline_dir in read_dir(path)? { - let entry = timeline_dir?; - let path = entry.path(); - let timeline_id = TimelineId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for trace_dir in read_dir(path)? { - let entry = trace_dir?; - let path = entry.path(); - let connection_id = - ConnectionId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - trace_files.push(TraceFile { - tenant_id, - timeline_id, - connection_id, - path, - }); - } - } - } - - Ok(trace_files) -} - -fn main() -> anyhow::Result<()> { - let args = Args::parse(); - - match args.command { - Command::List => { - for trace_file in get_trace_files(&args.path)? { - println!("{trace_file:?}"); - } - } - Command::Dump => { - for trace_file in get_trace_files(&args.path)? { - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - dump_trace(reader); - } - } - Command::Analyze => { - for trace_file in get_trace_files(&args.path)? { - println!("analyzing {trace_file:?}"); - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - analyze_trace(reader); - } - } - } - - Ok(()) -} From 5c80743c9c7b5d6ca840343ea088f39e9b560ea0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 11 Jul 2024 15:43:28 +0100 Subject: [PATCH 226/412] storage_controller: fix ReconcilerWaiter::get_status (#8341) ## Problem SeqWait::would_wait_for returns Ok in the case when we would not wait for the sequence number and Err otherwise. ReconcilerWaiter::get_status uses it the wrong way around. 
This can cause the storage controller to go into a busy loop and make it look unavailable to the k8s controller. ## Summary of changes Use `SeqWait::would_wait_for` correctly. --- storage_controller/src/tenant_shard.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 3fcf31ac10..2ddab58aaf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -383,9 +383,9 @@ impl ReconcilerWaiter { } pub(crate) fn get_status(&self) -> ReconcilerStatus { - if self.seq_wait.would_wait_for(self.seq).is_err() { + if self.seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Done - } else if self.error_seq_wait.would_wait_for(self.seq).is_err() { + } else if self.error_seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Failed } else { ReconcilerStatus::InProgress From 8cc768254f207b310e4737c297235ba87fa95515 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 17:05:35 +0100 Subject: [PATCH 227/412] safekeeper: eviction metrics (#8348) ## Problem Follow up to https://github.com/neondatabase/neon/pull/8335, to improve observability of how many evict/restores we are doing. ## Summary of changes - Add `safekeeper_eviction_events_started_total` and `safekeeper_eviction_events_completed_total`, with a "kind" label of evict or restore. This gives us rates, and also ability to calculate how many are in progress. - Generalize SafekeeperMetrics test type to use the same helpers as pageserver, and enable querying any metric. - Read the new metrics at the end of the eviction test. --- Cargo.lock | 2 + safekeeper/Cargo.toml | 2 + safekeeper/src/metrics.rs | 26 +++++++++++++ safekeeper/src/timeline_eviction.rs | 19 ++++++++++ test_runner/fixtures/safekeeper/http.py | 48 +++++++++++------------- test_runner/regress/test_wal_acceptor.py | 24 +++++++++++- 6 files changed, 92 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b1525edee..b31ac69e6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5206,6 +5206,8 @@ dependencies = [ "sha2", "signal-hook", "storage_broker", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-io-timeout", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index a650d5e207..9f32016fd9 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -41,6 +41,8 @@ serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 539ecf826b..aa2bafbe92 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") }); +// Metrics collected on operations on the storage repository. 
+#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum EvictionEvent { + Evict, + Restore, +} + +pub(crate) static EVICTION_EVENTS_STARTED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_started_total", + "Number of eviction state changes, incremented when they start", + &["kind"] + ) + .expect("Failed to register metric") +}); + +pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_completed_total", + "Number of eviction state changes, incremented when they complete", + &["kind"] + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index e4ab65290d..0b8d58ee8a 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; use crate::{ + metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -66,6 +67,15 @@ impl Manager { info!("starting eviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + }); + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { warn!("failed to evict timeline: {:?}", e); return; @@ -88,6 +98,15 @@ impl Manager { info!("starting uneviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + }); + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { warn!("failed to unevict timeline: {:?}", e); return; diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 11e6fef28f..a51b89744b 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -1,6 +1,5 @@ import json -import re -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import pytest @@ -8,6 +7,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics # Walreceiver as returned by sk's timeline status endpoint. @@ -31,15 +31,26 @@ class SafekeeperTimelineStatus: walreceivers: List[Walreceiver] -@dataclass -class SafekeeperMetrics: +class SafekeeperMetrics(Metrics): + # Helpers to get metrics from tests without hardcoding the metric names there. # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. 
- flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + def __init__(self, m: Metrics): + self.metrics = m.metrics + + def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value -class SafekeeperHttpClient(requests.Session): +class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): @@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() return request_result.text def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7efd86e349..e0ad4fdd5c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) - m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) + m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. 
@@ -2274,3 +2274,23 @@ def test_s3_eviction( and sk.log_contains("successfully restored evicted timeline") for sk in env.safekeepers ) + + assert any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) From d3ff47f5722cf73c6f23783f55a145d368e78eae Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 17:05:47 +0100 Subject: [PATCH 228/412] storage controller: add node deletion API (#8226) ## Problem In anticipation of later adding a really nice drain+delete API, I initially only added an intentionally basic `/drop` API that is just about usable for deleting nodes in a pinch, but requires some ugly storage controller restarts to persuade it to restart secondaries. ## Summary of changes I started making a few tiny fixes, and ended up writing the delete API... - Quality of life nit: ordering of node + tenant listings in storcon_cli - Papercut: Fix the attach_hook using the wrong operation type for reporting slow locks - Make Service::spawn tolerate `generation_pageserver` columns that point to nonexistent node IDs. I started out thinking of this as a general resilience thing, but when implementing the delete API I realized it was actually a legitimate end state after the delete API is called (as that API doesn't wait for all reconciles to succeed). - Add a `DELETE` API for nodes, which does not gracefully drain, but does reschedule everything. This becomes safe to use when the system is in any state, but will incur availability gaps for any tenants that weren't already live-migrated away. If tenants have already been drained, this becomes a totally clean + safe way to decom a node. - Add a test and a storcon_cli wrapper for it This is meant to be a robust initial API that lets us remove nodes without doing ugly things like restarting the storage controller -- it's not quite a totally graceful node-draining routine yet. There's more work in https://github.com/neondatabase/neon/issues/8333 to get to our end-end state. 
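A minimal invocation sketch for the new endpoint, using assumed placeholder values (controller listening on 127.0.0.1:1234, node id 5, a token with admin scope, auth enabled); the route and the `Scope::Admin` check come from the handler added below, and the same operation is also wired into `storcon_cli` via the new `NodeDelete` subcommand:

    # placeholders: adjust host/port, node id and token to your deployment
    curl -X DELETE \
      -H "Authorization: Bearer $ADMIN_TOKEN" \
      http://127.0.0.1:1234/control/v1/node/5

The request returns once the node is removed from memory and the database; reconciles that migrate its shards elsewhere may still be running, which is why a leftover `generation_pageserver` pointing at a deleted node is tolerated (and rescheduled) at startup.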
--- control_plane/storcon_cli/src/main.rs | 19 ++- storage_controller/src/http.rs | 11 ++ storage_controller/src/service.rs | 121 +++++++++++++++++- storage_controller/src/tenant_shard.rs | 19 ++- test_runner/fixtures/neon_fixtures.py | 8 ++ test_runner/regress/test_compatibility.py | 25 ++++ .../regress/test_storage_controller.py | 88 +++++++++++++ 7 files changed, 277 insertions(+), 14 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b2c5dfe58a..815f5c940f 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -56,6 +56,10 @@ enum Command { #[arg(long)] scheduling: Option, }, + NodeDelete { + #[arg(long)] + node_id: NodeId, + }, /// Modify a tenant's policies in the storage controller TenantPolicy { #[arg(long)] @@ -357,13 +361,16 @@ async fn main() -> anyhow::Result<()> { tracing::info!("Delete status: {}", status); } Command::Nodes {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); + let mut table = comfy_table::Table::new(); table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); for node in resp { @@ -395,13 +402,16 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::Tenants {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/tenant".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); + let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", @@ -650,6 +660,11 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) .await?; } + Command::NodeDelete { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) + .await?; + } Command::TenantSetTimeBasedEviction { tenant_id, period, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 7446ad53a2..3a62c0dd4f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -456,6 +456,14 @@ async fn handle_node_drop(req: Request) -> Result, ApiError json_response(StatusCode::OK, state.service.node_drop(node_id).await?) } +async fn handle_node_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_delete(node_id).await?) 
+} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -878,6 +886,9 @@ pub fn make_router( .post("/control/v1/node", |r| { named_request_span(r, handle_node_register, RequestName("control_v1_node")) }) + .delete("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete")) + }) .get("/control/v1/node", |r| { named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index aada1939ee..b6e2b53191 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ use std::{ borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, + ops::Deref, path::PathBuf, str::FromStr, sync::Arc, @@ -115,12 +116,14 @@ enum TenantOperations { SecondaryDownload, TimelineCreate, TimelineDelete, + AttachHook, } #[derive(Clone, strum_macros::Display)] enum NodeOperations { Register, Configure, + Delete, } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; @@ -845,9 +848,10 @@ impl Service { tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence ))] - fn process_result(&self, result: ReconcileResult) { + fn process_result(&self, mut result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else { // A reconciliation result might race with removing a tenant: drop results for // tenants that aren't in our map. return; @@ -864,6 +868,13 @@ impl Service { // Let the TenantShard know it is idle. tenant.reconcile_complete(result.sequence); + // In case a node was deleted while this reconcile is in flight, filter it out of the update we will + // make to the tenant + result + .observed + .locations + .retain(|node_id, _loc| nodes.contains_key(node_id)); + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { @@ -873,6 +884,7 @@ impl Service { tracing::info!("Setting observed location {} to None", node_id,) } } + tenant.observed = result.observed; tenant.waiter.advance(result.sequence); } @@ -1109,8 +1121,16 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); - if let Some(generation_pageserver) = tsp.generation_pageserver { - intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); + if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) + { + if nodes.contains_key(&generation_pageserver) { + intent.set_attached(&mut scheduler, Some(generation_pageserver)); + } else { + // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring + // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations + // on different pageservers. 
+ tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1237,7 +1257,7 @@ impl Service { let _tenant_lock = trace_exclusive_lock( &self.tenant_op_locks, attach_req.tenant_shard_id.tenant_id, - TenantOperations::ShardSplit, + TenantOperations::AttachHook, ) .await; @@ -4210,8 +4230,6 @@ impl Service { /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. We do not try and re-schedule any /// tenants that were on this node. - /// - /// TODO: proper node deletion API that unhooks things more gracefully pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { self.persistence.delete_node(node_id).await?; @@ -4219,6 +4237,7 @@ impl Service { for shard in locked.tenants.values_mut() { shard.deref_node(node_id); + shard.observed.locations.remove(&node_id); } let mut nodes = (*locked.nodes).clone(); @@ -4230,6 +4249,94 @@ impl Service { Ok(()) } + /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense + /// that we don't leave any bad state behind in the storage controller, but unclean + /// in the sense that we are not carefully draining the node. + pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> { + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await; + + // 1. Atomically update in-memory state: + // - set the scheduling state to Pause to make subsequent scheduling ops skip it + // - update shards' intents to exclude the node, and reschedule any shards whose intents we modified. + // - drop the node from the main nodes map, so that when running reconciles complete they do not + // re-insert references to this node into the ObservedState of shards + // - drop the node from the scheduler + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + { + let mut nodes_mut = (*nodes).deref().clone(); + match nodes_mut.get_mut(&node_id) { + Some(node) => { + // We do not bother setting this in the database, because we're about to delete the row anyway, and + // if we crash it would not be desirable to leave the node paused after a restart. + node.set_scheduling(NodeSchedulingPolicy::Pause); + } + None => { + tracing::info!( + "Node not found: presuming this is a retry and returning success" + ); + return Ok(()); + } + } + + *nodes = Arc::new(nodes_mut); + } + + for (tenant_shard_id, shard) in tenants { + if shard.deref_node(node_id) { + // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise + // it won't properly do anti-affinity. + let mut schedule_context = ScheduleContext::default(); + + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {tenant_shard_id} away from node during deletion" + ) + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. 
+ // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); + } + + scheduler.node_remove(node_id); + + { + let mut nodes_mut = (**nodes).clone(); + nodes_mut.remove(&node_id); + *nodes = Arc::new(nodes_mut); + } + } + + // Note: some `generation_pageserver` columns on tenant shards in the database may still refer to + // the removed node, as this column means "The pageserver to which this generation was issued", and + // their generations won't get updated until the reconcilers moving them away from this node complete. + // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node + // that exists. + + // 2. Actually delete the node from the database and from in-memory state + tracing::info!("Deleting node from database"); + self.persistence.delete_node(node_id).await?; + + Ok(()) + } + pub(crate) async fn node_list(&self) -> Result, ApiError> { let nodes = { self.inner diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2ddab58aaf..2574dc297a 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1229,18 +1229,27 @@ impl TenantShard { } } - // If we had any state at all referring to this node ID, drop it. Does not - // attempt to reschedule. - pub(crate) fn deref_node(&mut self, node_id: NodeId) { + /// If we had any state at all referring to this node ID, drop it. Does not + /// attempt to reschedule. + /// + /// Returns true if we modified the node's intent state. + pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool { + let mut intent_modified = false; + + // Drop if this node was our attached intent if self.intent.attached == Some(node_id) { self.intent.attached = None; + intent_modified = true; } + // Drop from the list of secondaries, and check if we modified it + let had_secondaries = self.intent.secondary.len(); self.intent.secondary.retain(|n| n != &node_id); - - self.observed.locations.remove(&node_id); + intent_modified |= self.intent.secondary.len() != had_secondaries; debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + + intent_modified } pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5ca31644a9..463e4a3b01 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2287,6 +2287,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def node_delete(self, node_id): + log.info(f"node_delete({node_id})") + self.request( + "DELETE", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + def node_drain(self, node_id): log.info(f"node_drain({node_id})") self.request( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 65649e0c0a..1e5e320e0e 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -93,6 +93,29 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) +def fixup_storage_controller(env: NeonEnv): + """ + After importing a repo_dir, we need to massage the storage controller's state a bit: it will have + initially 
started up with no nodes, but some tenants, and thereby those tenants won't be scheduled + anywhere. + + After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get + the storage controller into a good state. + + This function should go away once compat tests carry the controller database in their snapshots, so + that the controller properly remembers nodes between creating + restoring the snapshot. + """ + env.storage_controller.allowed_errors.extend( + [ + ".*Tenant shard .+ references non-existent node.*", + ".*Failed to schedule tenant .+ at startup.*", + ] + ) + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.reconcile_until_idle() + + @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot( @@ -175,6 +198,7 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") neon_env_builder.start() + fixup_storage_controller(env) check_neon_works( env, @@ -263,6 +287,7 @@ def test_forward_compatibility( assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) neon_env_builder.start() + fixup_storage_controller(env) # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d37f7aae3d..741f16685e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1611,3 +1611,91 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env.storage_controller.cancel_node_drain(ps_id_to_drain) env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) + + +@pytest.mark.parametrize("while_offline", [True, False]) +def test_storage_controller_node_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + while_offline: bool, +): + """ + Test that deleting a node works & properly reschedules everything that was on the node. + """ + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 10 + shard_count_per_tenant = 8 + tenant_ids = [] + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + victim = env.pageservers[-1] + + # The procedure a human would follow is: + # 1. Mark pageserver scheduling=pause + # 2. Mark pageserver availability=offline to trigger migrations away from it + # 3. Wait for attachments to all move elsewhere + # 4. Call deletion API + # 5. Stop the node. 
+ + env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"}) + + if while_offline: + victim.stop(immediate=True) + env.storage_controller.node_configure(victim.id, {"availability": "Offline"}) + + def assert_shards_migrated(): + counts = get_node_shard_counts(env, tenant_ids) + elsewhere = sum(v for (k, v) in counts.items() if k != victim.id) + log.info(f"Shards on nodes other than on victim: {elsewhere}") + assert elsewhere == tenant_count * shard_count_per_tenant + + wait_until(30, 1, assert_shards_migrated) + + log.info(f"Deleting pageserver {victim.id}") + env.storage_controller.node_delete(victim.id) + + if not while_offline: + + def assert_victim_evacuated(): + counts = get_node_shard_counts(env, tenant_ids) + count = counts[victim.id] + log.info(f"Shards on node {victim.id}: {count}") + assert count == 0 + + wait_until(30, 1, assert_victim_evacuated) + + # The node should be gone from the list API + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + + # No tenants should refer to the node in their intent + for tenant_id in tenant_ids: + describe = env.storage_controller.tenant_describe(tenant_id) + for shard in describe["shards"]: + assert shard["node_attached"] != victim.id + assert victim.id not in shard["node_secondary"] + + # Reconciles running during deletion should all complete + # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting + # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 + # test that hasn't uploaded any heatmaps for secondaries. + # In the interim, just do a reconcile_all to enable the consistency check. + # env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_all() + + # Controller should pass its own consistency checks + env.storage_controller.consistency_check() + + # The node should stay gone across a restart + env.storage_controller.stop() + env.storage_controller.start() + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. + env.storage_controller.consistency_check() From 8532d7227622ed6d0be53f7bd0fae0195f0e1656 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 11 Jul 2024 19:14:49 +0300 Subject: [PATCH 229/412] Fix memory context of NeonWALReader allocation. Allocating it in short living context is wrong because it is reused during backend lifetime. --- pgxn/neon/neon_walreader.c | 9 +++++---- test_runner/regress/test_logical_replication.py | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 60eb8e1fc9..0f76514b86 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -109,11 +109,12 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ { NeonWALReader *reader; + /* + * Note: we allocate in TopMemoryContext, reusing the reader for all process + * reads. 
+ */ reader = (NeonWALReader *) - palloc_extended(sizeof(NeonWALReader), - MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); - if (!reader) - return NULL; + MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; reader->seg.ws_file = -1; diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 41283e4d2c..66afe9ddfd 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -247,6 +247,12 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + # do the peek second time: we've had a bug using wrong memory context + # for NeonWALReader leading to the crash in this case. + log.info("peek_changes again") + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers From 7dd71f41266c315dd9fb6bb8cc9b17facde04a52 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 11 Jul 2024 14:28:16 -0400 Subject: [PATCH 230/412] feat(pageserver): rewrite streaming vectored read planner (#8242) Rewrite streaming vectored read planner to be a separate struct. The API is designed to produce batches around `max_read_size` instead of exactly less than that so that `handle_XX` returns one batch a time. --------- Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/delta_layer.rs | 10 +- .../src/tenant/storage_layer/image_layer.rs | 7 +- pageserver/src/tenant/vectored_blob_io.rs | 273 +++++++++++++----- 3 files changed, 218 insertions(+), 72 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index dfd0196c87..2d36ac7442 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1321,7 +1321,7 @@ impl DeltaLayerInner { offsets.start.pos(), offsets.end.pos(), meta, - Some(max_read_size), + max_read_size, )) } } else { @@ -1615,13 +1615,17 @@ impl<'a> DeltaLayerIterator<'a> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) { + if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { break batch_plan; } } else { self.is_end = true; let data_end_offset = self.delta_layer.index_start_offset(); - break self.planner.handle_range_end(data_end_offset); + if let Some(item) = self.planner.handle_range_end(data_end_offset) { + break item; + } else { + return Ok(()); // TODO: test empty iterator + } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1e03e1a58c..1440c0db84 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -994,14 +994,17 @@ impl<'a> ImageLayerIterator<'a> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, - BlobFlag::None, ) { break batch_plan; } } else { self.is_end = true; let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64; - break self.planner.handle_range_end(payload_end); + if let Some(item) = 
self.planner.handle_range_end(payload_end) { + break item; + } else { + return Ok(()); // TODO: a test case on empty iterator + } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 7ad8446e04..1b470034db 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -68,7 +68,7 @@ impl VectoredRead { } } -#[derive(Eq, PartialEq)] +#[derive(Eq, PartialEq, Debug)] pub(crate) enum VectoredReadExtended { Yes, No, @@ -91,7 +91,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: Option, + max_read_size: usize, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -102,10 +102,9 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size, + max_read_size: Some(max_read_size), } } - /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size @@ -164,7 +163,7 @@ pub struct VectoredReadPlanner { // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, - max_read_size: Option, + max_read_size: usize, } impl VectoredReadPlanner { @@ -172,20 +171,7 @@ impl VectoredReadPlanner { Self { blobs: BTreeMap::new(), prev: None, - max_read_size: Some(max_read_size), - } - } - - /// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`], - /// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored - /// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the - /// code path to split reads into chunks of `max_read_size`, and controls the read size itself. - #[cfg(test)] - pub(crate) fn new_caller_controlled_max_limit() -> Self { - Self { - blobs: BTreeMap::new(), - prev: None, - max_read_size: None, + max_read_size, } } @@ -376,17 +362,18 @@ impl<'a> VectoredBlobReader<'a> { } /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and -/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`]. +/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// max_cnt constraints. #[cfg(test)] pub struct StreamingVectoredReadPlanner { - planner: VectoredReadPlanner, - /// Max read size per batch + read_builder: Option, + // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64)>, + /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, + /// we will produce a single batch instead of split them. 
max_read_size: u64, /// Max item count per batch max_cnt: usize, - /// The first offset of this batch - this_batch_first_offset: Option, /// Size of the current batch cnt: usize, } @@ -397,62 +384,88 @@ impl StreamingVectoredReadPlanner { assert!(max_cnt > 0); assert!(max_read_size > 0); Self { - // We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call. - // Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability, - // to avoid splitting into two I/Os. - planner: VectoredReadPlanner::new_caller_controlled_max_limit(), + read_builder: None, + prev: None, max_cnt, max_read_size, - this_batch_first_offset: None, cnt: 0, } } - fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead { - let planner = std::mem::replace( - &mut self.planner, - VectoredReadPlanner::new_caller_controlled_max_limit(), - ); - self.this_batch_first_offset = Some(this_batch_first_offset); - self.cnt = 1; - let mut batch = planner.finish(); - assert_eq!(batch.len(), 1, "should have exactly one read batch"); - batch.pop().unwrap() + pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset) = match self.prev { + None => { + self.prev = Some((key, lsn, offset)); + return None; + } + Some(prev) => prev, + }; + + let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + + self.prev = Some((key, lsn, offset)); + + res } - pub fn handle( + pub fn handle_range_end(&mut self, offset: u64) -> Option { + let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + } else { + None + }; + + self.prev = None; + + res + } + + fn add_blob( &mut self, key: Key, lsn: Lsn, - offset: u64, - flag: BlobFlag, + start_offset: u64, + end_offset: u64, + is_last_blob_in_read: bool, ) -> Option { - if let Some(begin_offset) = self.this_batch_first_offset { - // Each batch will have at least one item b/c `self.this_batch_first_offset` is set - // after one item gets processed - if offset - begin_offset > self.max_read_size { - self.planner.handle_range_end(offset); // End the current batch with the offset - let batch = self.emit(offset); // Produce a batch - self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch - return Some(batch); + match &mut self.read_builder { + Some(read_builder) => { + let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + assert_eq!(extended, VectoredReadExtended::Yes); } - } else { - self.this_batch_first_offset = Some(offset) - } - if self.cnt >= self.max_cnt { - self.planner.handle_range_end(offset); // End the current batch with the offset - let batch = self.emit(offset); // Produce a batch - self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch - return Some(batch); - } - self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch - self.cnt += 1; - None - } + None => { + self.read_builder = { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, BlobMeta { key, lsn }) + .expect("First insertion always succeeds"); - pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead { - self.planner.handle_range_end(offset); - self.emit(offset) + Some(VectoredReadBuilder { + start: start_offset, + end: 
end_offset, + blobs_at, + max_read_size: None, + }) + }; + } + } + let read_builder = self.read_builder.as_mut().unwrap(); + self.cnt += 1; + if is_last_blob_in_read + || read_builder.size() >= self.max_read_size as usize + || self.cnt >= self.max_cnt + { + let prev_read_builder = self.read_builder.take(); + self.cnt = 0; + + // `current_read_builder` is None in the first iteration + if let Some(read_builder) = prev_read_builder { + return Some(read_builder.build()); + } + } + None } } @@ -509,8 +522,11 @@ mod tests { planner.handle_range_end(652 * 1024); let reads = planner.finish(); + assert_eq!(reads.len(), 6); + // TODO: could remove zero reads to produce 5 reads here + for (idx, read) in reads.iter().enumerate() { validate_read(read, ranges[idx]); } @@ -548,4 +564,127 @@ mod tests { validate_read(read, ranges[idx]); } } + + #[test] + fn streaming_planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_max_cnt_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..2], + &blob_descriptions[2..4], + &blob_descriptions[4..6], + &blob_descriptions[6..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_edge_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle_range_end(652 * 1024)); + assert!(reads.is_empty()); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read(&reads[0], &[(key, lsn, 0, 
BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 2); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read( + &reads[0], + &[ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + ], + ); + } + } } From 90f447b79da57ead0a34e689fe4eea44542a3cf6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 11 Jul 2024 22:03:35 +0300 Subject: [PATCH 231/412] test: limit `test_layer_download_timeouted` to MOCK_S3 (#8331) Requests against REAL_S3 on CI can consistently take longer than 1s; testing the short timeouts against it made no sense in hindsight, as MOCK_S3 works just as well. evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8229/9857994025/index.html#suites/b97efae3a617afb71cb8142f5afa5224/6828a50921660a32 --- test_runner/regress/test_ondemand_download.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 4a25dfd874..c8249bb2ce 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -764,7 +764,9 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): """ Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
""" - neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # running this test is not reliable against REAL_S3, because operations can + # take longer than 1s we want to use as a timeout + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" From 119ddf6ccf8c5a690af183c919de5b3b4daa3f43 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 11 Jul 2024 13:29:35 -0700 Subject: [PATCH 232/412] Grant execute on snapshot functions to neon_superuser (#8346) ## Problem I need `neon_superuser` to be allowed to create snapshots for replication tests ## Summary of changes Adds a migration that grants these functions to neon_superuser --- ...nt_snapshot_synchronization_funcs_to_neon_superuser.sql | 7 +++++++ compute_tools/src/spec.rs | 3 +++ test_runner/regress/test_migrations.py | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..28750e00dd --- /dev/null +++ b/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,7 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 37090b08fd..1d12b88c7c 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -790,6 +790,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" ), include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), + include_str!( + "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" + ), ]; MigrationRunner::new(client, &migrations).run_migrations()?; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 5637f160cf..91bd3ea50c 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -13,7 +13,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 9 + num_migrations = 10 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") From a1df835e28fa6a1ca50579b9d327557f5d7e22d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 12 Jul 2024 01:43:44 +0200 Subject: [PATCH 233/412] Pass configured compression param to image generation (#8363) We need to pass on the configured compression param during image layer generation. This was an oversight of #8106, and the likely cause why #8288 didn't bring any interesting regressions. 
Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/tenant/storage_layer/image_layer.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1440c0db84..a88a1e6429 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -809,7 +809,11 @@ impl ImageLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img, ctx).await; + let compression = self.conf.image_compression; + let (_img, res) = self + .blob_writer + .write_blob_maybe_compressed(img, ctx, compression) + .await; // TODO: re-use the buffer for `img` further upstack let off = res?; From 0e600eb921689aad27bee3291d0dceac84314280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 12 Jul 2024 04:32:34 +0200 Subject: [PATCH 234/412] Implement decompression for vectored reads (#8302) Implement decompression of images for vectored reads. This doesn't implement support for still treating blobs as uncompressed with the bits we reserved for compression, as we have removed that functionality in #8300 anyways. Part of #5431 --- pageserver/src/tenant/blob_io.rs | 40 ++++--- pageserver/src/tenant/vectored_blob_io.rs | 127 +++++++++++++++++++--- 2 files changed, 139 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index e98ed66ef9..791eefebe9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> { } /// Reserved bits for length and compression -const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; +pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. The highest few bits /// are reserved for compression and other further uses. const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; -const BYTE_UNCOMPRESSED: u8 = 0x80; -const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; +pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; +pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; /// A wrapper of `VirtualFile` that allows users to write blobs. 
/// @@ -390,51 +390,63 @@ impl BlobWriter { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; + use camino::Utf8PathBuf; + use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } - async fn round_trip_test_compressed( + pub(crate) async fn write_maybe_compressed( blobs: &[Vec], compression: bool, - ) -> Result<(), Error> { + ctx: &RequestContext, + ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; + let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = if compression { wtr.write_blob_maybe_compressed( blob.clone(), - &ctx, + ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await } else { - wtr.write_blob(blob.clone(), &ctx).await + wtr.write_blob(blob.clone(), ctx).await }; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer(&ctx).await?; + wtr.flush_buffer(ctx).await?; } + Ok((temp_dir, pathbuf, offsets)) + } - let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(pathbuf, &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { @@ -447,7 +459,7 @@ mod tests { Ok(()) } - fn random_array(len: usize) -> Vec { + pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); (0..len).map(|_| rng.gen()).collect::<_>() } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1b470034db..cb81f1d76d 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,11 +20,13 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; use crate::context::RequestContext; +use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -301,7 +303,7 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); - let buf = self + let mut buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) .await? 
@@ -323,38 +325,68 @@ impl<'a> VectoredBlobReader<'a> { .chain(std::iter::once(None)), ); + // Some scratch space, put here for reusing the allocation + let mut decompressed_vec = Vec::new(); + for ((offset, meta), next) in pairs { let offset_in_buf = offset - start_offset; let first_len_byte = buf[offset_in_buf as usize]; - // Each blob is prefixed by a header containing it's size. + // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. // The size can be 1 or 4 bytes. The most significant bit is 0 in the // 1 byte case and 1 in the 4 byte case. - let (size_length, blob_size) = if first_len_byte < 0x80 { - (1, first_len_byte as u64) + let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 { + (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; let offset_in_buf = offset_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); - blob_size_buf[0] &= 0x7f; - (4, u32::from_be_bytes(blob_size_buf) as u64) + blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; + ( + 4, + u32::from_be_bytes(blob_size_buf) as u64, + compression_bits, + ) }; - let start = offset_in_buf + size_length; - let end = match next { + let start_raw = offset_in_buf + size_length; + let end_raw = match next { Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start + blob_size, + None => start_raw + blob_size, }; - - assert_eq!(end - start, blob_size); + assert_eq!(end_raw - start_raw, blob_size); + let (start, end); + if compression_bits == BYTE_UNCOMPRESSED { + start = start_raw as usize; + end = end_raw as usize; + } else if compression_bits == BYTE_ZSTD { + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder + .write_all(&buf[start_raw as usize..end_raw as usize]) + .await?; + decoder.flush().await?; + start = buf.len(); + buf.extend_from_slice(&decompressed_vec); + end = buf.len(); + decompressed_vec.clear(); + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + } metas.push(VectoredBlob { - start: start as usize, - end: end as usize, + start, + end, meta: *meta, - }) + }); } Ok(VectoredBlobsBuf { buf, blobs: metas }) @@ -471,6 +503,13 @@ impl StreamingVectoredReadPlanner { #[cfg(test)] mod tests { + use anyhow::Error; + + use crate::context::DownloadBehavior; + use crate::page_cache::PAGE_SZ; + use crate::task_mgr::TaskKind; + + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { @@ -687,4 +726,64 @@ mod tests { ); } } + + async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(&pathbuf, &ctx).await?; + let file_len = std::fs::metadata(&pathbuf)?.len(); + + // Multiply by two (compressed data might need more space), and add a few bytes for the header + let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; + let mut buf = BytesMut::with_capacity(reserved_bytes); + + let 
vectored_blob_reader = VectoredBlobReader::new(&file); + let meta = BlobMeta { + key: Key::MIN, + lsn: Lsn(0), + }; + + for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { + let end = offsets.get(idx + 1).unwrap_or(&file_len); + if idx + 1 == offsets.len() { + continue; + } + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); + let read = read_builder.build(); + let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; + assert_eq!(result.blobs.len(), 1); + let read_blob = &result.blobs[0]; + let read_buf = &result.buf[read_blob.start..read_blob.end]; + assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}"); + buf = result.buf; + } + Ok(()) + } + + #[tokio::test] + async fn test_really_big_array() -> Result<(), Error> { + let blobs = &[ + b"test".to_vec(), + random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], + b"foobar".to_vec(), + ]; + round_trip_test_compressed(blobs, false).await?; + round_trip_test_compressed(blobs, true).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_inc() -> Result<(), Error> { + let blobs = (0..PAGE_SZ / 8) + .map(|v| random_array(v * 16)) + .collect::>(); + round_trip_test_compressed(&blobs, false).await?; + round_trip_test_compressed(&blobs, true).await?; + Ok(()) + } } From c4407564109b09054b777daa37cb37e9b5df2ed5 Mon Sep 17 00:00:00 2001 From: Japin Li Date: Fri, 12 Jul 2024 17:56:06 +0800 Subject: [PATCH 235/412] Remove fs2 dependency (#8350) The fs2 dependency is not needed anymore after commit d42700280. --- Cargo.lock | 11 ----------- Cargo.toml | 1 - safekeeper/Cargo.toml | 1 - 3 files changed, 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b31ac69e6c..bab0b4dd1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2028,16 +2028,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -5179,7 +5169,6 @@ dependencies = [ "crc32c", "desim", "fail", - "fs2", "futures", "git-version", "hex", diff --git a/Cargo.toml b/Cargo.toml index 6bad8e3b20..670e3241d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,7 +83,6 @@ enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } -fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 9f32016fd9..0fdb3147bf 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,7 +23,6 @@ clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true fail.workspace = true -fs2.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true From 3f8819827cdbd1b7cca42fdf04a493007713126b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 12 Jul 2024 12:04:02 +0100 Subject: [PATCH 236/412] pageserver: circuit breaker on compaction (#8359) ## Problem We already back off on compaction retries, but the impact of a failing compaction can be so great that backing off up to 300s isn't enough. The impact is consuming a lot of I/O+CPU in the case of image layer generation for large tenants, and potentially also leaking disk space. 
Compaction failures are extremely rare and almost always indicate a bug, frequently a bug that will not let compaction to proceed until it is fixed. Related: https://github.com/neondatabase/neon/issues/6738 ## Summary of changes - Introduce a CircuitBreaker type - Add a circuit breaker for compaction, with a policy that after 5 failures, compaction will not be attempted again for 24 hours. - Add metrics that we can alert on: any >0 value for `pageserver_circuit_breaker_broken_total` should generate an alert. - Add a test that checks this works as intended. Couple notes to reviewers: - Circuit breakers are intrinsically a defense-in-depth measure: this is not the solution to any underlying issues, it is just a general mitigation for "unknown unknowns" that might be encountered in future. - This PR isn't primarily about writing a perfect CircuitBreaker type: the one in this PR is meant to be just enough to mitigate issues in compaction, and make it easy to monitor/alert on these failures. We can refine this type in future as/when we want to use it elsewhere. --- libs/utils/src/circuit_breaker.rs | 114 +++++++++++++++++++++++++ libs/utils/src/lib.rs | 2 + pageserver/src/metrics.rs | 16 ++++ pageserver/src/tenant.rs | 36 +++++++- test_runner/regress/test_compaction.py | 63 ++++++++++++++ 5 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 libs/utils/src/circuit_breaker.rs diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs new file mode 100644 index 0000000000..720ea39d4f --- /dev/null +++ b/libs/utils/src/circuit_breaker.rs @@ -0,0 +1,114 @@ +use std::{ + fmt::Display, + time::{Duration, Instant}, +}; + +use metrics::IntCounter; + +/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, +/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// to mitigate the log spam from repeated failures. +pub struct CircuitBreaker { + /// An identifier that enables us to log useful errors when a circuit is broken + name: String, + + /// Consecutive failures since last success + fail_count: usize, + + /// How many consecutive failures before we break the circuit + fail_threshold: usize, + + /// If circuit is broken, when was it broken? + broken_at: Option, + + /// If set, we will auto-reset the circuit this long after it was broken. If None, broken + /// circuits stay broken forever, or until success() is called. + reset_period: Option, + + /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker + /// to permit something to keep running even if it would otherwise have tripped it. + short_circuit: bool, +} + +impl CircuitBreaker { + pub fn new(name: String, fail_threshold: usize, reset_period: Option) -> Self { + Self { + name, + fail_count: 0, + fail_threshold, + broken_at: None, + reset_period, + short_circuit: false, + } + } + + /// Construct an unbreakable circuit breaker, for use in unit tests etc. 
+ pub fn short_circuit() -> Self { + Self { + name: String::new(), + fail_threshold: 0, + fail_count: 0, + broken_at: None, + reset_period: None, + short_circuit: true, + } + } + + pub fn fail(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + if self.short_circuit { + return; + } + + self.fail_count += 1; + if self.broken_at.is_none() && self.fail_count >= self.fail_threshold { + self.break_circuit(metric, error); + } + } + + /// Call this after successfully executing an operation + pub fn success(&mut self, metric: &IntCounter) { + self.fail_count = 0; + if let Some(broken_at) = &self.broken_at { + tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})", + humantime::format_duration(broken_at.elapsed())); + self.broken_at = None; + metric.inc(); + } + } + + /// Call this before attempting an operation, and skip the operation if we are currently broken. + pub fn is_broken(&mut self) -> bool { + if self.short_circuit { + return false; + } + + if let Some(broken_at) = self.broken_at { + match self.reset_period { + Some(reset_period) if broken_at.elapsed() > reset_period => { + self.reset_circuit(); + false + } + _ => true, + } + } else { + false + } + } + + fn break_circuit(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + self.broken_at = Some(Instant::now()); + tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}"); + metric.inc(); + } + + fn reset_circuit(&mut self) { + self.broken_at = None; + self.fail_count = 0; + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 711e617801..9ad1752fb7 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -98,6 +98,8 @@ pub mod poison; pub mod toml_edit_ext; +pub mod circuit_breaker; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e67fa656d0..9b3bb481b9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -569,6 +569,22 @@ static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_broken", + "How many times a circuit breaker has broken" + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_unbroken", + "How many times a circuit breaker has been un-broken (recovered)" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bf23513527..6333fd3b63 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -39,6 +39,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::backoff; +use utils::circuit_breaker::CircuitBreaker; use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; @@ -76,7 +77,8 @@ use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + remove_tenant_metrics, BROKEN_TENANTS_SET, 
CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; use crate::repository::GcResult; use crate::task_mgr; @@ -276,6 +278,10 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// Track repeated failures to compact, so that we can back off. + /// Overhead of mutex is acceptable because compaction is done with a multi-second period. + compaction_circuit_breaker: std::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. @@ -1641,13 +1647,31 @@ impl Tenant { timelines_to_compact }; + // Before doing any I/O work, check our circuit breaker + if self.compaction_circuit_breaker.lock().unwrap().is_broken() { + info!("Skipping compaction due to previous failures"); + return Ok(()); + } + for (timeline_id, timeline) in &timelines_to_compact { timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) - .await?; + .await + .map_err(|e| { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, &e); + e + })?; } + self.compaction_circuit_breaker + .lock() + .unwrap() + .success(&CIRCUIT_BREAKERS_UNBROKEN); + Ok(()) } @@ -2563,6 +2587,14 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new( + format!("compaction-{tenant_shard_id}"), + 5, + // Compaction can be a very expensive operation, and might leak disk space. It also ought + // to be infallible, as long as remote storage is available. So if it repeatedly fails, + // use an extremely long backoff. 
+ Some(Duration::from_secs(3600 * 24)), + )), activate_now_sem: tokio::sync::Semaphore::new(0), cancel: CancellationToken::default(), gate: Gate::default(), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 49dcb9b86a..f321c09b27 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,12 +1,14 @@ import enum import json import os +import time from typing import Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -257,3 +259,64 @@ def test_uploads_and_deletions( found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) if not found_allowed_error: raise Exception("None of the allowed_errors occured in the log") + + +def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder): + """ + Check that repeated failures in compaction result in a circuit breaker breaking + """ + TENANT_CONF = { + # Very frequent runs to rack up failures quickly + "compaction_period": "100ms", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024 * 128, + # Compact small layers + "compaction_target_size": 1024 * 128, + "image_creation_threshold": 1, + } + + FAILPOINT = "delta-layer-writer-fail-before-finish" + BROKEN_LOG = ".*Circuit breaker broken!.*" + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + # Set a failpoint that will prevent compaction succeeding + env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) + + # Write some data to trigger compaction + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + + def assert_broken(): + env.pageserver.assert_log_contains(BROKEN_LOG) + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_circuit_breaker_unbroken_total" + ) + or 0 + ) == 0 + + # Wait for enough failures to break the circuit breaker + # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s + wait_until(60, 1, assert_broken) + + # Sleep for a while, during which time we expect that compaction will _not_ be retried + time.sleep(10) + + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_unbroken_total") + or 0 + ) == 0 + assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") From 7472c69954f70391b1ad3590449e4ffa6e732fcb Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 12 Jul 2024 13:58:04 +0100 Subject: [PATCH 237/412] Fix nightly warnings 2024 june (#8151) ## Problem new clippy warnings on nightly. ## Summary of changes broken up each commit by warning type. 1. Remove some unnecessary refs. 2. In edition 2024, inference will default to `!` and not `()`. 3. Clippy complains about doc comment indentation 4. Fix `Trait + ?Sized` where `Trait: Sized`. 5. 
diesel_derives triggering `non_local_defintions` --- compute_tools/src/bin/compute_ctl.rs | 3 +-- compute_tools/src/compute.rs | 1 + control_plane/storcon_cli/src/main.rs | 2 +- libs/utils/src/http/endpoint.rs | 14 +++++------ pageserver/compaction/src/interface.rs | 2 +- pageserver/src/context.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/disk_btree.rs | 6 ++--- .../src/tenant/storage_layer/layer_desc.rs | 2 +- pageserver/src/tenant/timeline.rs | 11 +++++---- pageserver/src/tenant/timeline/delete.rs | 6 +++-- .../src/tenant/timeline/logical_size.rs | 24 +++++++++---------- pageserver/src/tenant/timeline/walreceiver.rs | 10 ++++---- pageserver/src/tenant/vectored_blob_io.rs | 4 ++-- .../virtual_file/owned_buffers_io/write.rs | 1 + proxy/src/compute.rs | 2 +- proxy/src/redis/cancellation_publisher.rs | 2 +- .../connection_with_credentials_provider.rs | 2 +- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/conn_pool.rs | 2 +- proxy/src/waiters.rs | 2 +- safekeeper/src/wal_backup.rs | 1 + storage_controller/src/persistence.rs | 1 + storage_controller/src/service.rs | 14 +++++------ storage_controller/src/tenant_shard.rs | 1 + 26 files changed, 64 insertions(+), 56 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f4c396a85d..0ba2c1aeb4 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -6,7 +6,7 @@ //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. //! - If remote_extension_config is provided, it will be used to fetch extensions list -//! and download `shared_preload_libraries` from the remote storage. +//! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the returned on the previous step LSN. @@ -33,7 +33,6 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` -//! 
use std::collections::HashMap; use std::fs::File; use std::path::Path; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1112795d30..91855d954d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -56,6 +56,7 @@ pub struct ComputeNode { /// - we push new spec and it does reconfiguration /// - but then something happens and compute pod / VM is destroyed, /// so k8s controller starts it again with the **old** spec + /// /// and the same for empty computes: /// - we started compute without any spec /// - we push spec and it does configuration diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 815f5c940f..777a717a73 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -341,7 +341,7 @@ async fn main() -> anyhow::Result<()> { } Command::TenantCreate { tenant_id } => { storcon_client - .dispatch( + .dispatch::<_, ()>( Method::POST, "v1/tenant".to_string(), Some(TenantCreateRequest { diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index f8a5f68131..8ee5abd434 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -52,17 +52,17 @@ struct RequestId(String); /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods -/// With all the drawbacks of procmacros, brings no difference implementation-wise, -/// and little code reduction compared to the existing approach. +/// With all the drawbacks of procmacros, brings no difference implementation-wise, +/// and little code reduction compared to the existing approach. /// /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, -/// implemented for [`RouterBuilder`]. -/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. +/// implemented for [`RouterBuilder`]. +/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. /// /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped -/// later, in a post-response middleware. -/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` -/// tries to achive with its `.instrument` used in the current approach. +/// later, in a post-response middleware. +/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` +/// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. pub async fn request_span(request: Request, handler: H) -> R::Output diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 35519b5d0a..5bc9b5ca1d 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -131,7 +131,7 @@ impl CompactionKey for Key { pub type CompactionKeySpace = Vec>; /// Functions needed from all layers. 
-pub trait CompactionLayer { +pub trait CompactionLayer { fn key_range(&self) -> &Range; fn lsn_range(&self) -> &Range; diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 86d0390c30..0b07e07524 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -59,6 +59,7 @@ //! 1. It should be easy to forward the context to callees. //! 2. To propagate more data from high-level to low-level code, the functions in //! the middle should not need to be modified. +//! //! The solution is to have a container structure ([`RequestContext`]) that //! carries the information. Functions that don't care about what's in it //! pass it along to callees. diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8a6cfea92b..a821b824d0 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -522,7 +522,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { let mut max: Option = None; - self.map_all_timestamps(probe_lsn, ctx, |timestamp| { + self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| { if let Some(max_prev) = max { max = Some(max_prev.max(timestamp)); } else { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b76498b608..251d2ab4ad 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -550,10 +550,10 @@ where /// We maintain the length of the stack to be always greater than zero. /// Two exceptions are: /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. - /// So because other methods cannot see the intermediate state invariant still holds. + /// So because other methods cannot see the intermediate state invariant still holds. /// 2. `Self::finish`. It consumes self and does not return it back, - /// which means that this is where the structure is destroyed. - /// Thus stack of zero length cannot be observed by other methods. + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a89b66e4a1..bd765560e4 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -25,7 +25,7 @@ pub struct PersistentLayerDesc { /// /// - For an open in-memory layer, the end bound is MAX_LSN /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the - /// range start + /// range start /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 pub lsn_range: Range, /// Whether this is a delta layer, and also, is this incremental. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 762e903bf8..a3ddb3a1d1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3408,6 +3408,7 @@ impl Timeline { } } + #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// /// The algorithm is as follows: @@ -4474,10 +4475,10 @@ impl Timeline { /// are required. Since checking if new image layers are required is expensive in /// terms of CPU, we only do it in the following cases: /// 1. 
If the timeline has ingested sufficient WAL to justify the cost - /// 2. If enough time has passed since the last check - /// 2.1. For large tenants, we wish to perform the check more often since they - /// suffer from the lack of image layers - /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval + /// 2. If enough time has passed since the last check: + /// 1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; @@ -4719,7 +4720,7 @@ impl Timeline { /// Requires a timeline that: /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not - /// a technical requirement + /// a technical requirement /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b0088f4ea2..d32945d9e4 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant( /// 5. Delete index part /// 6. Delete meta, timeline directory /// 7. Delete mark file +/// /// It is resumable from any step in case a crash/restart occurs. /// There are three entrypoints to the process: /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present -/// and we possibly neeed to continue deletion of remote files. +/// and we possibly neeed to continue deletion of remote files. /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote -/// index but still have local metadata, timeline directory and delete mark. +/// index but still have local metadata, timeline directory and delete mark. +/// /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. #[derive(Default)] pub enum DeleteTimelineFlow { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 8f9ca0e29f..b0d6c4a27a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; /// Calculation consists of two stages: /// /// 1. Initial size calculation. That might take a long time, because it requires -/// reading all layers containing relation sizes at `initial_part_end`. +/// reading all layers containing relation sizes at `initial_part_end`. /// /// 2. Collecting an incremental part and adding that to the initial size. -/// Increments are appended on walreceiver writing new timeline data, -/// which result in increase or decrease of the logical size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. pub(super) struct LogicalSize { /// Size, potentially slow to compute. Calculating this might require reading multiple /// layers, and even ancestor's layers. 
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize { /// Size shouldn't ever be negative, but this is signed for two reasons: /// /// 1. If we initialized the "baseline" size lazily, while we already - /// process incoming WAL, the incoming WAL records could decrement the - /// variable and temporarily make it negative. (This is just future-proofing; - /// the initialization is currently not done lazily.) + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) /// /// 2. If there is a bug and we e.g. forget to increment it in some cases - /// when size grows, but remember to decrement it when it shrinks again, the - /// variable could go negative. In that case, it seems better to at least - /// try to keep tracking it, rather than clamp or overflow it. Note that - /// get_current_logical_size() will clamp the returned value to zero if it's - /// negative, and log an error. Could set it permanently to zero or some - /// special value to indicate "broken" instead, but this will do for now. + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. /// /// Note that we also expose a copy of this value as a prometheus metric, /// see `current_logical_size_gauge`. Use the `update_current_logical_size` diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index a085154a5a..4a3a5c621b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -2,13 +2,13 @@ //! To do so, a current implementation needs to do the following: //! //! * acknowledge the timelines that it needs to stream WAL into. -//! Pageserver is able to dynamically (un)load tenants on attach and detach, -//! hence WAL receiver needs to react on such events. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. -//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. -//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. -//! Without this data, no WAL streaming is possible currently. +//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. //! //! Only one active WAL streaming connection is allowed at a time. //! The connection is supposed to be updated periodically, based on safekeeper timeline data. 
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index cb81f1d76d..5a0986ea12 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -191,9 +191,9 @@ impl VectoredReadPlanner { /// /// The `flag` argument has two interesting values: /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. - /// This is used for WAL records that `will_init`. + /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens - /// if the blob is cached. + /// if the blob is cached. pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 885a9221c5..8599d95cdf 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -33,6 +33,7 @@ pub struct BufferedWriter { /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever + /// /// In these exceptional cases, it's `None`. buf: Option, } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index a50a96e5e8..f91693c704 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -319,7 +319,7 @@ impl ConnCfg { let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..c9a946fa4a 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -106,7 +106,7 @@ impl RedisPublisherClient { cancel_key_data, session_id, }))?; - self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) } pub async fn try_connect(&mut self) -> anyhow::Result<()> { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 3a90d911c2..b02ce472c0 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -178,7 +178,7 @@ impl ConnectionWithCredentialsProvider { credentials_provider: Arc, ) -> anyhow::Result<()> { let (user, password) = credentials_provider.provide_credentials().await?; - redis::cmd("AUTH") + let _: () = redis::cmd("AUTH") .arg(user) .arg(password) .query_async(con) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 87d723d17e..efd7437d5d 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -127,7 +127,7 @@ impl MessageHandler { Cancel(cancel_session) => { tracing::Span::current().record( "session_id", - &tracing::field::display(cancel_session.session_id), + tracing::field::display(cancel_session.session_id), ); Metrics::get() 
.proxy diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6c34d48338..3b86c1838c 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -245,7 +245,7 @@ impl ConnectMechanism for TokioMechanism { drop(pause); let (client, connection) = permit.release_result(res)?; - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); Ok(poll_client( self.pool.clone(), ctx, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 170bda062e..dbc58d48ec 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -403,7 +403,7 @@ impl GlobalConnPool { tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", - &tracing::field::display(client.inner.get_process_id()), + tracing::field::display(client.inner.get_process_id()), ); info!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index bba5494cfe..888ad38048 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -111,7 +111,7 @@ mod tests { let waiters = Arc::clone(&waiters); let notifier = tokio::spawn(async move { - waiters.notify(key, Default::default())?; + waiters.notify(key, ())?; Ok(()) }); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 9ea048a3c7..5a590689c3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -119,6 +119,7 @@ async fn shut_down_task(entry: &mut Option) { /// time we have several ones as they PUT the same files. Also, /// - frequently changing the offloader would be bad; /// - electing seriously lagging safekeeper is undesirable; +/// /// So we deterministically choose among the reasonably caught up candidates. /// TODO: take into account failed attempts to deal with hypothetical situation /// where s3 is unreachable only for some sks. diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 47caf7ae81..9f7b2f775e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -542,6 +542,7 @@ impl Persistence { Ok(Generation::new(g as u32)) } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. /// /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b6e2b53191..deaac83ea5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5070,7 +5070,7 @@ impl Service { /// we did the split, but are probably better placed elsewhere. /// - Creating new secondary locations if it improves the spreading of a sharded tenant /// * e.g. after a shard split, some locations will be on the same node (where the split - /// happened), and will probably be better placed elsewhere. + /// happened), and will probably be better placed elsewhere. 
/// /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available @@ -5633,14 +5633,14 @@ impl Service { /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. + /// attached shards. If there are not enough secondaries on the node, the plan stops early. /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. + /// throughout the cluster. We achieve this by picking tenant shards from each node, + /// starting from the ones with the largest number of attached shards, until the node + /// reaches the expected cluster average. /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// for the number of tenants from the same shard promoted to the node being filled is: + /// shard count for the tenant divided by the number of nodes in the cluster. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2574dc297a..ee2ba6c4ee 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -124,6 +124,7 @@ pub(crate) struct TenantShard { /// - ReconcileWaiters need to Arc-clone the overall object to read it later /// - ReconcileWaitError needs to use an `Arc` because we can construct /// many waiters for one shard, and the underlying error types are not Clone. + /// /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? 
#[serde(serialize_with = "read_last_error")] From 85b5219861cb1366b9be374e079af0ba77307c2c Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 12 Jul 2024 09:28:13 -0400 Subject: [PATCH 238/412] fix(pageserver): unique test harness name for merge_in_between (#8366) As title, there should be a way to detect duplicated harness names in the future :( Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/merge_iterator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 36386c87c9..68759f7585 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -275,7 +275,7 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant From 085bbaf5f86165e0b7bd307e634786c6ed9a4ec2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 12 Jul 2024 17:31:17 +0100 Subject: [PATCH 239/412] tests: allow list breaching min resident size in statvfs test (#8358) ## Problem This test would sometimes violate the min resident size during disk eviction and fail due to the generate warning log. Disk usage candidate collection only takes into account active tenants. However, the statvfs call takes into account the entire tenants directory, which includes tenants which haven't become active yet. After re-starting the pageserver, disk usage eviction may kick in *before* both tenants have become active. Hence, the logic will try to satisfy thedisk usage requirements by evicting everything belonging to the active tenant, and hence violating the tenant minimum resident size. ## Summary of changes Allow the warning --- test_runner/regress/test_disk_usage_eviction.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7722828c79..fb8b7b22fa 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -794,6 +794,16 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): wait_until(2, 2, less_than_max_usage_pct) + # Disk usage candidate collection only takes into account active tenants. + # However, the statvfs call takes into account the entire tenants directory, + # which includes tenants which haven't become active yet. + # + # After re-starting the pageserver, disk usage eviction may kick in *before* + # both tenants have become active. Hence, the logic will try to satisfy the + # disk usage requirements by evicting everything belonging to the active tenant, + # and hence violating the tenant minimum resident size. 
+ env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): """ From 7973c3e94129807185c6f268e7d381649b71fe11 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 15 Jul 2024 15:52:00 +0300 Subject: [PATCH 240/412] Add neon.running_xacts_overflow_policy to make it possible for RO replica to startup without primary even in case running xacts overflow (#8323) ## Problem Right now if there are too many running xacts to be restored from CLOG at replica startup, then replica is not trying to restore them and wait for non-overflown running-xacs WAL record from primary. But if primary is not active, then replica will not start at all. Too many running xacts can be caused by transactions with large number of subtractions. But right now it can be also cause by two reasons: - Lack of shutdown checkpoint which updates `oldestRunningXid` (because of immediate shutdown) - nextXid alignment on 1024 boundary (which cause loosing ~1k XIDs on each restart) Both problems are somehow addressed now. But we have existed customers with "sparse" CLOG and lack of checkpoints. To be able to start RO replicas for such customers I suggest to add GUC which allows replica to start even in case of subxacts overflow. ## Summary of changes Add `neon.running_xacts_overflow_policy` with the following values: - ignore: restore from CLOG last N XIDs and accept connections - skip: do not restore any XIDs from CXLOGbut still accept connections - wait: wait non-overflown running xacts record from primary node ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 47 +++++++++++++++++++++-- test_runner/regress/test_replica_start.py | 46 +++++++++++++++++++++- 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index e4968bdf89..3197a7e715 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -46,6 +46,21 @@ void _PG_init(void); static int logical_replication_max_snap_files = 300; +static int running_xacts_overflow_policy; + +enum RunningXactsOverflowPolicies { + OP_IGNORE, + OP_SKIP, + OP_WAIT +}; + +static const struct config_enum_entry running_xacts_overflow_policies[] = { + {"ignore", OP_IGNORE, false}, + {"skip", OP_SKIP, false}, + {"wait", OP_WAIT, false}, + {NULL, 0, false} +}; + static void InitLogicalReplicationMonitor(void) { @@ -414,6 +429,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); n_restored_xids = 0; next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) { XLogRecPtr xidlsn; @@ -424,7 +440,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n /* * "Merge" the prepared transactions into the restored_xids array as * we go. The prepared transactions array is sorted. 
This is mostly - * a sanity check to ensure that all the prpeared transactions are + * a sanity check to ensure that all the prepared transactions are * seen as in-progress. (There is a check after the loop that we didn't * miss any.) */ @@ -522,14 +538,23 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); - goto fail; + + switch (running_xacts_overflow_policy) + { + case OP_WAIT: + goto fail; + case OP_IGNORE: + goto success; + case OP_SKIP: + n_restored_xids = 0; + goto success; + } } restored_xids[n_restored_xids++] = xid; skip: TransactionIdAdvance(xid); - continue; } /* sanity check */ @@ -540,11 +565,13 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n Assert(false); goto fail; } - + success: elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); *nxids = n_restored_xids; *xids = restored_xids; + if (prepared_xids) + pfree(prepared_xids); return true; fail: @@ -581,6 +608,18 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + + DefineCustomEnumVariable( + "neon.running_xacts_overflow_policy", + "Action performed on snapshot overflow when restoring runnings xacts from CLOG", + NULL, + &running_xacts_overflow_policy, + OP_IGNORE, + running_xacts_overflow_policies, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 17d476a8a6..0d95109d6b 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -210,7 +210,11 @@ def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): # Start it in a separate thread, so that we can do other stuff while it's # blocked waiting for the startup to finish. 
wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) - secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + secondary = env.endpoints.new_replica( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='wait'"], + ) start_secondary_thread = threading.Thread(target=secondary.start) start_secondary_thread.start() @@ -644,3 +648,43 @@ def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: N wait_replica_caughtup(primary, secondary) secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (200001,) + + +def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + large number of unsued XIDs, caused by XID alignment and frequent primary restarts + """ + n_restarts = 50 + + # Initialize the primary and a test table + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + with primary.cursor() as primary_cur: + primary_cur.execute("create table t(pk serial primary key, payload integer)") + + for _ in range(n_restarts): + with primary.cursor() as primary_cur: + primary_cur.execute("insert into t (payload) values (0)") + # restart primary + primary.stop("immediate") + primary.start() + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + # stop primary to check that we can start replica without it + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, because of ignore policy + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='ignore'"], + ) + + # Check that replica see all changes + with secondary.cursor() as secondary_cur: + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (n_restarts,) From 1637a6ee054608887d8ea04c8f0252fae5036acc Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Mon, 15 Jul 2024 13:38:52 +0200 Subject: [PATCH 241/412] proxy/http: switch to typed_json (#8377) ## Summary of changes This switches JSON rendering logic to `typed_json` in order to reduce the number of allocations in the HTTP responder path. Followup from https://github.com/neondatabase/neon/pull/8319#issuecomment-2216991760. 
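For illustration only (not part of the diff): a minimal, self-contained sketch of the difference between the two macros, assuming both `serde_json` and `typed_json` are available as dependencies as in the changes below.

```rust
fn main() {
    // serde_json::json! eagerly builds an owned serde_json::Value tree,
    // allocating a node for every object, array and string.
    let eager: serde_json::Value = serde_json::json!({ "command": "SELECT", "rowCount": 1 });
    let body_eager = serde_json::to_string(&eager).unwrap();

    // typed_json::json! instead returns an anonymous `impl serde::Serialize`,
    // so the same JSON is written straight to the output string without the
    // intermediate Value allocations.
    let lazy = typed_json::json!({ "command": "SELECT", "rowCount": 1 });
    let body_lazy = serde_json::to_string(&lazy).unwrap();

    assert_eq!(body_eager, body_lazy);
}
```

The handler changes below follow the same pattern: build the response with `json!` and serialize it once with `serde_json::to_string` at the end.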
--------- Co-authored-by: Conrad Ludgate --- Cargo.lock | 11 +++ Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/serverless/sql_over_http.rs | 97 +++++++++++++-------------- 4 files changed, 59 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bab0b4dd1f..8897364701 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4404,6 +4404,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "typed-json", "url", "urlencoding", "utils", @@ -6665,6 +6666,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-json" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "typenum" version = "1.16.0" diff --git a/Cargo.toml b/Cargo.toml index 670e3241d5..4f42203683 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,6 +184,7 @@ tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } +typed-json = "0.1" url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 288f7769fe..2f18b5fbc6 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +typed-json.workspace = true url.workspace = true urlencoding.workspace = true utils.workspace = true diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8118ae5ea8..6400e4ac7b 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -18,7 +18,7 @@ use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; use pq_proto::StartupMessageParamsBuilder; -use serde_json::json; +use serde::Serialize; use serde_json::Value; use tokio::time; use tokio_postgres::error::DbError; @@ -32,6 +32,7 @@ use tokio_postgres::Transaction; use tokio_util::sync::CancellationToken; use tracing::error; use tracing::info; +use typed_json::json; use url::Url; use utils::http::error::ApiError; @@ -263,13 +264,8 @@ pub async fn handle( | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() + fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T { + db.map(x).unwrap_or_default() } if let Some(db_error) = db_error { @@ -278,17 +274,11 @@ pub async fn handle( let position = db_error.and_then(|db| db.position()); let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), + Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None), + Some(ErrorPosition::Internal { position, query }) => { + (None, Some(position.to_string()), 
Some(query.clone())) + } + None => (None, None, None), }; let code = get(db_error, |db| db.code().code()); @@ -578,10 +568,8 @@ async fn handle_inner( .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); - // - // Now execute the query and return the result - // - let result = match payload { + // Now execute the query and return the result. + let json_output = match payload { Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { if parsed_headers.txn_read_only { @@ -605,11 +593,9 @@ async fn handle_inner( let metrics = client.metrics(); - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); + let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(body))) + .body(Full::new(Bytes::from(json_output))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -631,7 +617,7 @@ impl QueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -644,7 +630,10 @@ impl QueryData { // The query successfully completed. Either::Left((Ok((status, results)), __not_yet_cancelled)) => { discard.check_idle(status); - Ok(results) + + let json_output = + serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(json_output) } // The query failed with an error Either::Left((Err(e), __not_yet_cancelled)) => { @@ -662,7 +651,10 @@ impl QueryData { // query successed before it was cancelled. Ok(Ok((status, results))) => { discard.check_idle(status); - Ok(results) + + let json_output = serde_json::to_string(&results) + .expect("json serialization should not fail"); + Ok(json_output) } // query failed or was cancelled. 
Ok(Err(error)) => { @@ -696,7 +688,7 @@ impl BatchQueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { info!("starting transaction"); let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -718,9 +710,9 @@ impl BatchQueryData { e })?; - let results = + let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(results) => { + Ok(json_output) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -729,7 +721,7 @@ impl BatchQueryData { e })?; discard.check_idle(status); - results + json_output } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -753,7 +745,7 @@ impl BatchQueryData { } }; - Ok(json!({ "results": results })) + Ok(json_output) } } @@ -762,7 +754,7 @@ async fn query_batch( transaction: &Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result, SqlOverHttpError> { +) -> Result { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { @@ -787,7 +779,11 @@ async fn query_batch( } } } - Ok(results) + + let results = json!({ "results": results }); + let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + + Ok(json_output) } async fn query_to_json( @@ -795,7 +791,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { info!("executing query"); let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); @@ -844,8 +840,8 @@ async fn query_to_json( for c in row_stream.columns() { fields.push(json!({ - "name": Value::String(c.name().to_owned()), - "dataTypeID": Value::Number(c.type_().oid().into()), + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), "tableID": c.table_oid(), "columnID": c.column_id(), "dataTypeSize": c.type_size(), @@ -863,15 +859,14 @@ async fn query_to_json( .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; - // resulting JSON format is based on the format of node-postgres result - Ok(( - ready, - json!({ - "command": command_tag_name, - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }), - )) + // Resulting JSON format is based on the format of node-postgres result. + let results = json!({ + "command": command_tag_name.to_string(), + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + "rowAsArray": array_mode, + }); + + Ok((ready, results)) } From 537ecf45f87819fe95bf43b6ded7ef5a2c15d80f Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 11 Jul 2024 16:35:31 +0300 Subject: [PATCH 242/412] Fix test_timeline_copy flakiness. 
fixes https://github.com/neondatabase/neon/issues/8355 --- safekeeper/src/copy_timeline.rs | 10 ++++++++-- test_runner/regress/test_wal_acceptor.py | 5 +++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 14bd3c03b8..220988c3ce 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> { assert!(flush_lsn >= start_lsn); if request.until_lsn > flush_lsn { - bail!("requested LSN is beyond the end of the timeline"); + bail!(format!( + "requested LSN {} is beyond the end of the timeline {}", + request.until_lsn, flush_lsn + )); } if request.until_lsn < start_lsn { - bail!("requested LSN is before the start of the timeline"); + bail!(format!( + "requested LSN {} is before the start of the timeline {}", + request.until_lsn, start_lsn + )); } if request.until_lsn > commit_lsn { diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e0ad4fdd5c..2e906e6160 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2065,6 +2065,11 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): log.info(f"Original digest: {orig_digest}") for sk in env.safekeepers: + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, lsn), + f"sk_id={sk.id} to flush {lsn}", + ) + sk.http_client().copy_timeline( tenant_id, timeline_id, From 72c2d0812ee860354f1554ac76533ca02fe58237 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Jul 2024 16:33:56 +0200 Subject: [PATCH 243/412] remove page_service `show ` (#8372) This operation isn't used in practice, so let's remove it. Context: in https://github.com/neondatabase/neon/pull/8339 --- pageserver/src/metrics.rs | 1 - pageserver/src/page_service.rs | 60 ---------------- test_runner/regress/test_auth.py | 2 +- test_runner/regress/test_tenant_conf.py | 96 ++----------------------- 4 files changed, 5 insertions(+), 154 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9b3bb481b9..abad4b44b8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1490,7 +1490,6 @@ pub(crate) enum ComputeCommandKind { Basebackup, Fullbackup, LeaseLsn, - Show, } pub(crate) struct ComputeCommandCounters { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f94b0d335e..00147a8ca6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1479,66 +1479,6 @@ where ))? 
} }; - } else if let Some(params) = parts.strip_prefix(&["show"]) { - // show - if params.len() != 1 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for config command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - - tracing::Span::current().record("tenant_id", field::display(tenant_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::Show) - .inc(); - - let tenant = self - .get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - ) - .await?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"checkpoint_distance"), - RowDescriptor::int8_col(b"checkpoint_timeout"), - RowDescriptor::int8_col(b"compaction_target_size"), - RowDescriptor::int8_col(b"compaction_period"), - RowDescriptor::int8_col(b"compaction_threshold"), - RowDescriptor::int8_col(b"gc_horizon"), - RowDescriptor::int8_col(b"gc_period"), - RowDescriptor::int8_col(b"image_creation_threshold"), - RowDescriptor::int8_col(b"pitr_interval"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(tenant.get_checkpoint_distance().to_string().as_bytes()), - Some( - tenant - .get_checkpoint_timeout() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_target_size().to_string().as_bytes()), - Some( - tenant - .get_compaction_period() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_threshold().to_string().as_bytes()), - Some(tenant.get_gc_horizon().to_string().as_bytes()), - Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), - Some(tenant.get_image_creation_threshold().to_string().as_bytes()), - Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 922a21a999..7cb85e3dd1 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"show {env.initial_tenant}", + f"pagestream {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 80fb2b55b8..1a8bc3b983 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,10 +1,7 @@ import json -from contextlib import closing from typing import Any, Dict -import psycopg2.extras from fixtures.common_types import Lsn -from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) @@ -63,25 +60,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): # check the configuration of the default tenant # it should match global configuration - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant}") - pscur.execute(f"show {env.initial_tenant}") - res = pscur.fetchone() - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 10000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 60 * 60, - "image_creation_threshold": 3, - "pitr_interval": 604800, # 7 days - }.items() - ), f"Unexpected res: {res}" default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) assert ( not default_tenant_config.tenant_specific_overrides @@ -103,25 +81,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): } # check the configuration of the new tenant - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 20000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 30, - "image_creation_threshold": 3, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" new_tenant_config = http_client.tenant_config(tenant_id=tenant) new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 @@ -166,25 +125,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): conf=conf_update, ) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after config res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() 
- ), f"Unexpected res: {res}" updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides assert updated_specific_config["checkpoint_distance"] == 15000 @@ -222,25 +162,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) assert ( restarted_tenant_config == updated_tenant_config @@ -283,19 +204,10 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "compaction_period": 20, - "pitr_interval": 60, - }.items() - ), f"Unexpected res: {res}" + restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_final_tenant_config == final_tenant_config + ), "Updated config should not change after the restart" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): From cfaf30f5e84583907678e07d2df65c68dec47930 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 15 Jul 2024 18:08:24 +0300 Subject: [PATCH 244/412] feat(storcon): timeline detach ancestor passthrough (#8353) Currently storage controller does not support forwarding timeline detach ancestor requests to pageservers. Add support for forwarding `PUT .../:tenant_id/timelines/:timeline_id/detach_ancestor`. Implement the support mostly as is, because the timeline detach ancestor will be made (mostly) idempotent in future PR. 
Cc: #6994 --- .../src/models/detach_ancestor.rs | 2 +- pageserver/client/src/mgmt_api.rs | 18 +++ storage_controller/src/http.rs | 26 ++++ storage_controller/src/pageserver_client.rs | 22 ++- storage_controller/src/service.rs | 140 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 2 +- .../regress/test_timeline_detach_ancestor.py | 97 +++++++++++- 7 files changed, 281 insertions(+), 26 deletions(-) diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index fc1f10e734..ae5a21bab9 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,6 @@ use utils::id::TimelineId; -#[derive(Default, serde::Serialize)] +#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { pub reparented_timelines: Vec, } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index e3ddb446fa..ac3ff1bb89 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use bytes::Bytes; +use detach_ancestor::AncestorDetached; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ @@ -418,6 +419,23 @@ impl Client { } } + pub async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", + self.mgmt_api_endpoint + ); + + self.request(Method::PUT, &uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 3a62c0dd4f..9ddf98eb3b 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_detach_ancestor( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let res = service + .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, res) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -1006,6 +1022,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_detach_ancestor, + RequestName("v1_tenant_timeline_detach_ancestor"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 769aba80ca..8d64201cd9 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,8 +1,9 @@ use pageserver_api::{ models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, 
TopTenantShardsResponse, + detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, + PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -226,6 +227,21 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline_detach_ancestor", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index deaac83ea5..95522525cb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -117,6 +117,7 @@ enum TenantOperations { TimelineCreate, TimelineDelete, AttachHook, + TimelineDetachAncestor, } #[derive(Clone, strum_macros::Display)] @@ -2376,18 +2377,18 @@ impl Service { tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); client - .tenant_time_travel_remote_storage( - tenant_shard_id, - ×tamp, - &done_if_after, - ) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node - )) - })?; + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; } } Ok(()) @@ -2757,7 +2758,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.is_empty() { // If we had multiple shards, issue requests for the remainder now. 
- let jwt = self.config.jwt_token.clone(); + let jwt = &self.config.jwt_token; self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { let create_req = create_req.clone(); Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) @@ -2768,6 +2769,115 @@ impl Service { Ok(timeline_info) } + pub(crate) async fn tenant_timeline_detach_ancestor( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDetachAncestor, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; + + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => { + ApiError::Conflict(format!("{node}: {msg}")) + } + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped + other => passthrough_api_error(&node, other), + } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + let any = results.pop().expect("we must have at least one response"); + + // FIXME: the ordering is not stable yet on pageserver, should be (ancestor_lsn, + // TimelineId) + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); + } + + Ok(any.1) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
@@ -2894,8 +3004,8 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) }) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 463e4a3b01..90ed838e1d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2400,7 +2400,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} """ response = self.request( "GET", diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 606ce203cd..803fcac583 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -11,11 +11,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException -from fixtures.pageserver.utils import wait_timeline_detail_404 -from fixtures.remote_storage import LocalFsStorage +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal @@ -559,11 +560,24 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) -def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + shards = 2 if sharded else 1 - client = env.pageserver.http_client() + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + if sharded: + # FIXME: should this be in the neon_env_builder.init_start? 
+ env.storage_controller.reconcile_until_idle() + client = env.storage_controller.pageserver_api() + else: + client = env.pageserver.http_client() with pytest.raises(PageserverApiException, match=".* no ancestors") as info: client.detach_ancestor(env.initial_tenant, env.initial_timeline) @@ -577,6 +591,17 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): client.detach_ancestor(env.initial_tenant, second_branch) assert info.value.status_code == 400 + client.detach_ancestor(env.initial_tenant, first_branch) + + # FIXME: this should be done by the http req handler + for ps in pageservers.values(): + ps.quiesce_tenants() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, first_branch) + # FIXME: this should be 200 OK because we've already completed it + assert info.value.status_code == 409 + client.tenant_delete(env.initial_tenant) with pytest.raises(PageserverApiException) as e: @@ -584,6 +609,58 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): assert e.value.status_code == 404 +def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + branch_name = "soon_detached" + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)" + ) + lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + + assert Lsn(detail["last_record_lsn"]) >= lsn + assert Lsn(detail["initdb_lsn"]) < lsn + assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline + + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, branch_timeline_id) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + + # TODO: ensure quescing is done on pageserver? + pageservers[node_id].quiesce_tenants() + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + wait_for_last_record_lsn( + pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn + ) + assert detail.get("ancestor_timeline_id") is None + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + count = int(ep.safe_psql("select count(*) from foo")[0][0]) + assert count == 10000 + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted @@ -591,3 +668,11 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): # - deletion of reparented while reparenting should fail once, then succeed (?) 
# - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. +# +# TEST: 1. tad which partially succeeds, one returns 500 +# 2. create branch below timeline? or delete timeline below +# 3. on retry all should report the same reparented timelines +# +# TEST: 1. tad is started, one node stalls, other restarts +# 2. client timeout before stall over +# 3. on retry with stalled and other being able to proceed From 2a3a1364746e8cd5bfab82308bba5fe84feb7ff3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 15 Jul 2024 17:43:05 +0100 Subject: [PATCH 245/412] pageserver: use PITR GC cutoffs as authoritative (#8365) ## Problem Pageserver GC uses a size-based condition (GC "horizon" in addition to time-based "PITR"). Eventually we plan to retire the size-based condition: https://github.com/neondatabase/neon/issues/6374 Currently, we always apply the more conservative of the two, meaning that tenants always retain at least 64MB of history (default horizon), even after a very long time has passed. This is particularly acute in cases where someone has dropped tables/databases, and then leaves a database idle: the horizon can prevent GCing very large quantities of historical data (we already account for this in synthetic size by ignoring gc horizon). We're not entirely removing GC horizon right now because we don't want to 100% rely on standby_horizon for robustness of physical replication, but we can tweak our logic to avoid retaining that 64MB LSN length indefinitely. ## Summary of changes - Rework `Timeline::find_gc_cutoffs`, with new logic: - If there is no PITR set, then use `DEFAULT_PITR_INTERVAL` (1 week) to calculate a time threshold. Retain either the horizon or up to that thresholds, whichever requires less data. - When there is a PITR set, and we have unambiguously resolved the timestamp to an LSN, then ignore the GC horizon entirely. For typical PITRs (1 day, 1 week), this will still easily retain enough data to avoid stressing read only replicas. The key property we end up with, whether a PITR is set or not, is that after enough time has passed, our GC cutoff on an idle timeline will catch up with the last_record_lsn. Using `DEFAULT_PITR_INTERVAL` is a bit of an arbitrary hack, but this feels like it isn't really worth the noise of exposing in TenantConfig. We could just make it a different named constant though. The end-end state will be that there is no gc_horizon at all, and that tenants with pitr_interval=0 would truly retain no history, so this constant would go away. --- pageserver/src/tenant/timeline.rs | 150 +++++++++++++--------- test_runner/regress/test_branch_and_gc.py | 4 +- 2 files changed, 91 insertions(+), 63 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a3ddb3a1d1..0996616a67 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -69,6 +69,7 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, @@ -4945,20 +4946,17 @@ impl Timeline { } /// Find the Lsns above which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. + /// garbage collection. 
/// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. + /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` + /// controls the time cutoff (or ZERO to disable time-based retention), and `cutoff_horizon` controls + /// the space-based retention. /// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. + /// This function doesn't simply to calculate time & space based retention: it treats time-based + /// retention as authoritative if enabled, and falls back to space-based retention if calculating + /// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might + /// be different to the `cutoff_horizon` input. Callers should treat the min() of the two cutoffs + /// in the response as the GC cutoff point for the timeline. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, @@ -4975,58 +4973,88 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // - // Some unit tests depend on garbage-collection working even when - // CLOG data is missing, so that find_lsn_for_timestamp() doesn't - // work, so avoid calling it altogether if time-based retention is not - // configured. It would be pointless anyway. - let pitr_cutoff = if pitr != Duration::ZERO { - let now = SystemTime::now(); - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match self - .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx) - .await? - { - LsnForTimestamp::Present(lsn) => lsn, - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - self.get_last_record_lsn() - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - } - } else { - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - *self.get_latest_gc_cutoff_lsn() + if cfg!(test) { + // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup + if pitr == Duration::ZERO { + return Ok(GcCutoffs { + pitr: self.get_last_record_lsn(), + horizon: cutoff_horizon, + }); + } + } + + // Calculate a time-based limit on how much to retain: + // - if PITR interval is set, then this is our cutoff. 
+ // - if PITR interval is not set, then we do a lookup + // based on DEFAULT_PITR_INTERVAL, so that size-based retention (horizon) + // does not result in keeping history around permanently on idle databases. + let time_cutoff = { + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } } - } else { - // No time-based retention was configured. Interpret this as "keep no history". - self.get_last_record_lsn() }; - Ok(GcCutoffs { - horizon: cutoff_horizon, - pitr: pitr_cutoff, + Ok(match (pitr, time_cutoff) { + (Duration::ZERO, Some(time_cutoff)) => { + // PITR is not set. Retain the size-based limit, or the default time retention, + // whichever requires less data. + GcCutoffs { + pitr: std::cmp::max(time_cutoff, cutoff_horizon), + horizon: std::cmp::max(time_cutoff, cutoff_horizon), + } + } + (Duration::ZERO, None) => { + // PITR is not set, and time lookup failed + GcCutoffs { + pitr: self.get_last_record_lsn(), + horizon: cutoff_horizon, + } + } + (_, None) => { + // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR + // cannot advance beyond what was already GC'd, and respect space-based retention + GcCutoffs { + pitr: *self.get_latest_gc_cutoff_lsn(), + horizon: cutoff_horizon, + } + } + (_, Some(time_cutoff)) => { + // PITR interval is set and we looked up timestamp successfully. Ignore + // size based retention and make time cutoff authoritative + GcCutoffs { + pitr: time_cutoff, + horizon: time_cutoff, + } + } }) } diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index eb503ddbfa..f2e3855c12 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): "compaction_period": "1 s", "compaction_threshold": "2", "image_creation_threshold": "1", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", + # Disable PITR, this test will set an explicit space-based GC limit + "pitr_interval": "0 s", } ) From 957f99cad5b3137cd5d754044902e3837658b568 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 15 Jul 2024 20:47:53 +0300 Subject: [PATCH 246/412] feat(timeline_detach_ancestor): success idempotency (#8354) Right now timeline detach ancestor reports an error (409, "no ancestor") on a new attempt after successful completion. This makes it troublesome for storage controller retries. 
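To make the intended retry contract concrete, here is a deliberately simplified, self-contained sketch; all types and fields are hypothetical stand-ins rather than the real pageserver structures, and the actual mechanism (checking the uploaded index metadata) is described below and implemented in the diff:

```rust
#[derive(Debug, PartialEq)]
struct AncestorDetached {
    reparented_timelines: Vec<u32>,
}

#[derive(Default)]
struct TimelineState {
    ancestor: Option<u32>,
    detached_from_original_ancestor: bool, // recorded in the uploaded index in the real code
    reparented: Vec<u32>,
}

/// The first call performs the detach; a later call that finds no ancestor checks whether a
/// previous detach already happened and, if so, reports success again instead of a 409.
fn detach_ancestor(tl: &mut TimelineState) -> Result<AncestorDetached, &'static str> {
    if tl.ancestor.is_none() {
        if tl.detached_from_original_ancestor {
            return Ok(AncestorDetached { reparented_timelines: tl.reparented.clone() });
        }
        return Err("409 Conflict: no ancestor");
    }
    // ... detach work elided ...
    tl.ancestor = None;
    tl.detached_from_original_ancestor = true;
    tl.reparented = vec![42];
    Ok(AncestorDetached { reparented_timelines: tl.reparented.clone() })
}

fn main() {
    let mut tl = TimelineState { ancestor: Some(7), ..Default::default() };
    let first = detach_ancestor(&mut tl).unwrap();
    let retry = detach_ancestor(&mut tl).unwrap(); // previously a 409, now the same 200 body
    assert_eq!(first, retry);
}
```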
Fix it to respond with `200 OK` as if the operation had just completed quickly. Additionally, the returned timeline identifiers in the 200 OK response are now ordered so that responses between different nodes for error comparison are done by the storage controller added in #8353. Design-wise, this PR introduces a new strategy for accessing the latest uploaded IndexPart: `RemoteTimelineClient::initialized_upload_queue(&self) -> Result, NotInitialized>`. It should be a more scalable way to query the latest uploaded `IndexPart` than to add a query method for each question directly on `RemoteTimelineClient`. GC blocking will need to be introduced to make the operation fully idempotent. However, it is idempotent for the cases demonstrated by tests. Cc: #6994 --- pageserver/src/http/routes.rs | 45 +- .../src/tenant/remote_timeline_client.rs | 27 +- .../tenant/remote_timeline_client/index.rs | 26 ++ pageserver/src/tenant/timeline.rs | 8 +- .../src/tenant/timeline/detach_ancestor.rs | 130 +++++- pageserver/src/tenant/upload_queue.rs | 10 +- storage_controller/src/service.rs | 9 +- test_runner/fixtures/pageserver/http.py | 21 +- .../regress/test_timeline_detach_ancestor.py | 430 ++++++++++++++++-- 9 files changed, 632 insertions(+), 74 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6f8f3e6389..d7ef70477f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1721,7 +1721,9 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor::Options; + use crate::tenant::timeline::detach_ancestor; + use pageserver_api::models::detach_ancestor::AncestorDetached; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1729,7 +1731,7 @@ async fn timeline_detach_ancestor_handler( let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); async move { - let mut options = Options::default(); + let mut options = detach_ancestor::Options::default(); let rewrite_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; @@ -1757,27 +1759,36 @@ async fn timeline_detach_ancestor_handler( let timeline = tenant.get_timeline(timeline_id, true)?; - let (_guard, prepared) = timeline + let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) .await?; - let res = state - .tenant_manager - .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) - .await; + // uncomment to allow early as possible Tenant::drop + // drop(tenant); - match res { - Ok(reparented_timelines) => { - let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + let resp = match progress { + detach_ancestor::Progress::Prepared(_guard, prepared) => { + // it would be great to tag the guard on to the tenant activation future + let reparented_timelines = state + .tenant_manager + .complete_detaching_timeline_ancestor( + tenant_shard_id, + timeline_id, + prepared, + ctx, + ) + .await + .context("timeline detach ancestor completion") + .map_err(ApiError::InternalServerError)?; + + AncestorDetached { reparented_timelines, - }; - - json_response(StatusCode::OK, resp) + } } - Err(e) => Err(ApiError::InternalServerError( - 
e.context("timeline detach completion"), - )), - } + detach_ancestor::Progress::Done(resp) => resp, + }; + + json_response(StatusCode::OK, resp) } .instrument(span) .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bc9364de61..66b759c8e0 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -241,7 +241,7 @@ use self::index::IndexPart; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; -use super::upload_queue::SetDeletedFlagProgress; +use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ @@ -1930,6 +1930,31 @@ impl RemoteTimelineClient { } } } + + /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue + /// externally to RemoteTimelineClient. + pub(crate) fn initialized_upload_queue( + &self, + ) -> Result, NotInitialized> { + let mut inner = self.upload_queue.lock().unwrap(); + inner.initialized_mut()?; + Ok(UploadQueueAccessor { inner }) + } +} + +pub(crate) struct UploadQueueAccessor<'a> { + inner: std::sync::MutexGuard<'a, UploadQueue>, +} + +impl<'a> UploadQueueAccessor<'a> { + pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { + match &*self.inner { + UploadQueue::Initialized(x) => &x.clean.0, + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + unreachable!("checked before constructing") + } + } + } } pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 6233a3477e..b439df8edb 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -176,6 +176,24 @@ pub(crate) struct Lineage { /// /// If you are adding support for detaching from a hierarchy, consider changing the ancestry /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + // FIXME: this is insufficient even for path of two timelines for future wal recovery + // purposes: + // + // assuming a "old main" which has received most of the WAL, and has a branch "new main", + // starting a bit before "old main" last_record_lsn. the current version works fine, + // because we will know to replay wal and branch at the recorded Lsn to do wal recovery. + // + // then assuming "new main" would similarly receive a branch right before its last_record_lsn, + // "new new main". the current implementation would just store ("new main", ancestor_lsn, _) + // here. 
however, we cannot recover from WAL using only that information, we would need the + // whole ancestry here: + // + // ```json + // [ + // ["old main", ancestor_lsn("new main"), _], + // ["new main", ancestor_lsn("new new main"), _] + // ] + // ``` #[serde(skip_serializing_if = "Option::is_none", default)] original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, } @@ -217,6 +235,14 @@ impl Lineage { self.original_ancestor .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } + + pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + self.original_ancestor.is_some() + } + + pub(crate) fn is_reparented(&self) -> bool { + !self.reparenting_history.is_empty() + } } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0996616a67..239dce8786 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4733,13 +4733,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, ctx: &RequestContext, - ) -> Result< - ( - completion::Completion, - detach_ancestor::PreparedTimelineDetach, - ), - detach_ancestor::Error, - > { + ) -> Result { detach_ancestor::prepare(self, tenant, options, ctx).await } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4fc89330ba..49ce3db3e6 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -10,6 +10,7 @@ use crate::{ }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::models::detach_ancestor::AncestorDetached; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -39,6 +40,9 @@ pub(crate) enum Error { #[error("unexpected error")] Unexpected(#[source] anyhow::Error), + + #[error("failpoint: {}", .0)] + Failpoint(&'static str), } impl From for ApiError { @@ -57,11 +61,41 @@ impl From for ApiError { | e @ Error::CopyDeltaPrefix(_) | e @ Error::UploadRewritten(_) | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + | e @ Error::Unexpected(_) + | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), } } } +impl From for Error { + fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self { + // treat all as shutting down signals, even though that is not entirely correct + // (uninitialized state) + Error::ShuttingDown + } +} + +impl From for Error { + fn from(value: FlushLayerError) -> Self { + match value { + FlushLayerError::Cancelled => Error::ShuttingDown, + FlushLayerError::NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => { + Error::FlushAncestor(value) + } + } + } +} + +pub(crate) enum Progress { + Prepared(completion::Completion, PreparedTimelineDetach), + Done(AncestorDetached), +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -88,7 +122,7 @@ pub(super) async fn prepare( tenant: &Tenant, options: Options, ctx: &RequestContext, -) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { +) -> Result { use Error::*; let Some((ancestor, ancestor_lsn)) = detached @@ -96,15 +130,67 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - // TODO: check if we have 
already been detached; for this we need to read the stored data - // on remote client, for that we need a follow-up which makes uploads cheaper and maintains - // a projection of the commited data. + { + let accessor = detached.remote_client.initialized_upload_queue()?; + + // we are safe to inspect the latest uploaded, because we can only witness this after + // restart is complete and ancestor is no more. + let latest = accessor.latest_uploaded_index_part(); + if !latest.lineage.is_detached_from_original_ancestor() { + return Err(NoAncestor); + } + } + + // detached has previously been detached; let's inspect each of the current timelines and + // report back the timelines which have been reparented by our detach + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) + .map(|tl| (tl.ancestor_lsn, tl.clone())) + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain( + |(_, tl)| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. + any_shutdown = true; + true + } + }, + ); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + let mut reparented = all_direct_children; + // why this instead of hashset? there is a reason, but I've forgotten it many times. // - // the error is wrong per openapi - return Err(NoAncestor); + // maybe if this was a hashset we would not be able to distinguish some race condition. + reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); + + return Ok(Progress::Done(AncestorDetached { + reparented_timelines: reparented + .into_iter() + .map(|(_, tl)| tl.timeline_id) + .collect(), + })); }; if !ancestor_lsn.is_valid() { + // rare case, probably wouldn't even load + tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } @@ -131,6 +217,15 @@ pub(super) async fn prepare( let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); + + fail::fail_point!( + "timeline-detach-ancestor::before_starting_after_locking", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::before_starting_after_locking" + )) + ); + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { let span = tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); @@ -151,7 +246,7 @@ pub(super) async fn prepare( } }; - res.map_err(FlushAncestor)?; + res?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. 
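
As a reading aid, here is a minimal sketch (not part of the patch) of how a caller is expected to consume the two `Progress` variants introduced above. The variant names and the `detached`/`tenant`/`options`/`ctx` parameter names come from this patch; the exact way the arguments are passed at the real call site (the HTTP handler shown at the top of this patch) is an assumption.

```rust
// Sketch only, assuming crate-internal context: prepare() now either hands
// back a completion guard plus the prepared layer set, or reports that the
// detach has already happened on an earlier attempt.
match detach_ancestor::prepare(detached, tenant, options, ctx).await? {
    detach_ancestor::Progress::Prepared(guard, prepared) => {
        // first invocation: go on to copy layers and reparent descendants,
        // keeping `guard` alive until the operation completes
        let _ = (guard, prepared);
    }
    detach_ancestor::Progress::Done(resp) => {
        // idempotent retry: the ancestor is already gone; `resp` is the
        // AncestorDetached body listing the reparented timelines
        let _ = resp;
    }
}
```
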
@@ -159,7 +254,7 @@ pub(super) async fn prepare( elapsed_ms = started_at.elapsed().as_millis(), "froze and flushed the ancestor" ); - Ok(()) + Ok::<_, Error>(()) } .instrument(span) .await?; @@ -283,7 +378,7 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok((guard, prepared)) + Ok(Progress::Prepared(guard, prepared)) } fn partition_work( @@ -350,7 +445,11 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown}; + + if target_timeline.cancel.is_cancelled() { + return Err(ShuttingDown); + } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -529,7 +628,7 @@ pub(super) async fn complete( match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push(timeline.timeline_id); + reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -551,5 +650,12 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } + reparented.sort_unstable(); + + let reparented = reparented + .into_iter() + .map(|(_, timeline_id)| timeline_id) + .collect(); + Ok(reparented) } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 50c977a950..f7440ecdae 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -228,18 +228,20 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + pub(crate) fn initialized_mut( + &mut self, + ) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { - Uninitialized => Err(NotInitialized::Uninitialized.into()), + Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { - Err(NotInitialized::ShuttingDown.into()) + Err(NotInitialized::ShuttingDown) } else { Ok(x) } } - Stopped(_) => Err(NotInitialized::Stopped.into()), + Stopped(_) => Err(NotInitialized::Stopped), } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 95522525cb..3c24433c42 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2830,9 +2830,10 @@ impl Service { match e { // no ancestor (ever) - Error::ApiError(StatusCode::CONFLICT, msg) => { - ApiError::Conflict(format!("{node}: {msg}")) - } + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), // too many ancestors Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) @@ -2859,8 +2860,6 @@ impl Service { let any = results.pop().expect("we must have at least one response"); - // FIXME: the ordering is not stable yet on pageserver, should be (ancestor_lsn, - // TimelineId) let mismatching = results .iter() .filter(|(_, res)| res != &any.1) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 03aee9e5c5..d66b94948a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -172,6 +172,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if auth_token is not None: 
self.headers["Authorization"] = f"Bearer {auth_token}" + def without_status_retrying(self) -> PageserverHttpClient: + retries = Retry( + status=0, + connect=5, + read=False, + backoff_factor=0.2, + status_forcelist=[], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + return PageserverHttpClient( + self.port, self.is_testing_enabled_or_skip, self.auth_token, retries + ) + @property def base_url(self) -> str: return f"http://localhost:{self.port}" @@ -814,17 +829,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, batch_size: int | None = None, - ) -> Set[TimelineId]: + **kwargs, + ) -> List[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, + **kwargs, ) self.verbose_error(res) json = res.json() - return set(map(TimelineId, json["reparented_timelines"])) + return list(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 803fcac583..d75ab4c060 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,5 +1,7 @@ import datetime import enum +import threading +import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier @@ -9,6 +11,7 @@ import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, PgBin, flush_ep_to_pageserver, @@ -17,7 +20,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.utils import assert_pageserver_backups_equal +from fixtures.utils import assert_pageserver_backups_equal, wait_until +from requests import ReadTimeout def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -161,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == set() + assert all_reparented == [] if restart_after: env.pageserver.stop() @@ -270,7 +274,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == {reparented, same_branchpoint} + assert set(all_reparented) == {reparented, same_branchpoint} env.pageserver.quiesce_tenants() @@ -530,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == set(), "we have no earlier branches at any level" + assert reparented == [], "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -561,7 +565,9 @@ def 
test_compaction_induced_by_detaches_in_history( @pytest.mark.parametrize("sharded", [True, False]) -def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): +def test_timeline_ancestor_detach_idempotent_success( + neon_env_builder: NeonEnvBuilder, sharded: bool +): shards = 2 if sharded else 1 neon_env_builder.num_pageservers = shards @@ -579,28 +585,28 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard else: client = env.pageserver.http_client() - with pytest.raises(PageserverApiException, match=".* no ancestors") as info: - client.detach_ancestor(env.initial_tenant, env.initial_timeline) - assert info.value.status_code == 409 - first_branch = env.neon_cli.create_branch("first_branch") - second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") - # funnily enough this does not have a prefix - with pytest.raises(PageserverApiException, match="too many ancestors") as info: - client.detach_ancestor(env.initial_tenant, second_branch) - assert info.value.status_code == 400 + _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") - client.detach_ancestor(env.initial_tenant, first_branch) + # these two will be reparented, and they should be returned in stable order + # from pageservers OR otherwise there will be an `error!` logging from + # storage controller + reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main") + reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main") + + first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) + assert set(first_reparenting_response) == {reparented1, reparented2} # FIXME: this should be done by the http req handler for ps in pageservers.values(): ps.quiesce_tenants() - with pytest.raises(PageserverApiException, match=".* no ancestors") as info: - client.detach_ancestor(env.initial_tenant, first_branch) - # FIXME: this should be 200 OK because we've already completed it - assert info.value.status_code == 409 + for _ in range(5): + # once completed, we can retry this how many times + assert ( + client.detach_ancestor(env.initial_tenant, first_branch) == first_reparenting_response + ) client.tenant_delete(env.initial_tenant) @@ -609,7 +615,50 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard assert e.value.status_code == 404 +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + # the test is split from test_timeline_ancestor_detach_idempotent_success as only these error cases should create "request was dropped before completing", + # given the current first error handling + shards = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append( + ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing" + ) + + client = ( + env.pageserver.http_client() if not sharded else env.storage_controller.pageserver_api() + ) + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert 
info.value.status_code == 409 + + _ = env.neon_cli.create_branch("first_branch") + + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + """ + Sharded timeline detach ancestor; 4 nodes: 1 stuck, 1 restarted, 2 normal. + + Stuck node gets stuck on a pause failpoint for first storage controller request. + Restarted node remains stuck until explicit restart from test code. + + We retry the request until storage controller gets 200 OK from all nodes. + """ branch_name = "soon_detached" shard_count = 4 neon_env_builder.num_pageservers = shard_count @@ -621,8 +670,15 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # FIXME: should this be in the neon_env_builder.init_start? env.storage_controller.reconcile_until_idle() + # as we will stop a node, make sure there is no clever rebalancing + env.storage_controller.tenant_policy_update(env.initial_tenant, body={"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*: Scheduling is disabled by policy Stop .*") + shards = env.storage_controller.locate(env.initial_tenant) + utilized_pageservers = {x["node_id"] for x in shards} + assert len(utilized_pageservers) > 1, "all shards got placed on single pageserver?" + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: @@ -642,7 +698,79 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert Lsn(detail["initdb_lsn"]) < lsn assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline - env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, branch_timeline_id) + # make one of the nodes get stuck, but continue the initial operation + # make another of the nodes get stuck, then restart + + stuck = pageservers[int(shards[0]["node_id"])] + stuck.allowed_errors.append(".*: request was dropped before completing") + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + stuck_http = stuck.http_client() + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") + ) + + restarted = pageservers[int(shards[1]["node_id"])] + restarted.allowed_errors.extend( + [ + ".*: request was dropped before completing", + ".*: Cancelled request finished with an error: ShuttingDown", + ] + ) + assert restarted.id != stuck.id + restarted_http = restarted.http_client() + restarted_http.configure_failpoints( + [ + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"), + ] + ) + + target = env.storage_controller.pageserver_api() + + with pytest.raises(ReadTimeout): + target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) + + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + + barrier = threading.Barrier(2) + + def restart_restarted(): + barrier.wait() + # graceful shutdown should just work, because simultaneously unpaused + restarted.stop() + # this does not happen always, depends how fast we exit after unpausing + # restarted.assert_log_contains("Cancelled 
request finished with an error: ShuttingDown") + restarted.start() + + with ThreadPoolExecutor(max_workers=1) as pool: + fut = pool.submit(restart_restarted) + barrier.wait() + # we have 10s, lets use 1/2 of that to help the shutdown start + time.sleep(5) + restarted_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + fut.result() + + # detach ancestor request handling is not sensitive to http cancellation. + # this means that the "stuck" is on its way to complete the detach, but the restarted is off + # now it can either be complete on all nodes, or still in progress with + # one. + without_retrying = target.without_status_retrying() + + # this retry loop will be long enough that the tenant can always activate + reparented = None + for _ in range(10): + try: + reparented = without_retrying.detach_ancestor(env.initial_tenant, branch_timeline_id) + except PageserverApiException as info: + assert info.status_code == 503 + time.sleep(2) + else: + break + + assert reparented == [], "too many retries (None) or unexpected reparentings" for shard_info in shards: node_id = int(shard_info["node_id"]) @@ -661,8 +789,262 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert count == 10000 +@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) +@pytest.mark.parametrize("sharded", [False, True]) +def test_timeline_detach_ancestor_interrupted_by_deletion( + neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool +): + """ + Timeline ancestor detach interrupted by deleting either: + - the detached timeline + - the whole tenant + + after starting the detach. + + What remains not tested by this: + - shutdown winning over complete + + Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. 
+ """ + + if sharded and mode == "delete_tenant": + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + pytest.skip( + "tenant deletion while timeline ancestor detach is underway is not supported yet" + ) + + shard_count = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shard_count + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + detached_timeline = env.neon_cli.create_branch("detached soon", "main") + + failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + assert len(set(info["node_id"] for info in shards)) == shard_count + + target = env.storage_controller.pageserver_api() if sharded else env.pageserver.http_client() + target = target.without_status_retrying() + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client() + victim_http.configure_failpoints((failpoint, "pause")) + + def detach_ancestor(): + target.detach_ancestor(env.initial_tenant, detached_timeline) + + def at_failpoint() -> Tuple[str, LogCursor]: + return victim.assert_log_contains(f"at failpoint {failpoint}") + + def start_delete(): + if mode == "delete_timeline": + target.timeline_delete(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_delete(env.initial_tenant) + else: + raise RuntimeError(f"unimplemented mode {mode}") + + def at_waiting_on_gate_close(start_offset: LogCursor) -> LogCursor: + _, offset = victim.assert_log_contains( + "closing is taking longer than expected", offset=start_offset + ) + return offset + + def is_deleted(): + try: + if mode == "delete_timeline": + target.timeline_detail(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_status(env.initial_tenant) + else: + return False + except PageserverApiException as e: + assert e.status_code == 404 + return True + else: + raise RuntimeError("waiting for 404") + + with ThreadPoolExecutor(max_workers=2) as pool: + try: + fut = pool.submit(detach_ancestor) + _, offset = wait_until(10, 1.0, at_failpoint) + + delete = pool.submit(start_delete) + + wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + + victim_http.configure_failpoints((failpoint, "off")) + + delete.result() + + assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + + with pytest.raises(PageserverApiException) as exc: + fut.result() + assert exc.value.status_code == 503 + finally: + victim_http.configure_failpoints((failpoint, "off")) + + +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) +def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): + """ + Technically possible storage controller concurrent interleaving timeline + deletion with timeline detach. + + Deletion is fine, as any sharded pageservers reach the same end state, but + creating reparentable timeline would create an issue as the two nodes would + never agree. There is a solution though: the created reparentable timeline + must be detached. 
+ """ + + assert ( + mode == "delete_reparentable_timeline" + ), "only one now, but we could have the create just as well, need gc blocking" + + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + with env.endpoints.create_start("main") as ep: + ep.safe_psql("create table foo as select i::bigint from generate_series(1, 1000) t(i)") + + # as the interleaved operation, we will delete this timeline, which was reparenting candidate + first_branch_lsn = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + ep.safe_psql("create table bar as select i::bigint from generate_series(1, 2000) t(i)") + detached_branch_lsn = flush_ep_to_pageserver( + env, ep, env.initial_tenant, env.initial_timeline + ) + + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + first_branch = env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + detached_branch = env.neon_cli.create_branch( + "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn + ) + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints((pausepoint, "pause")) + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints( + (pausepoint, "pause"), + ) + + # noticed a surprising 409 if the other one would fail instead + # victim_http.configure_failpoints([ + # (pausepoint, "pause"), + # ("timeline-detach-ancestor::before_starting_after_locking", "return"), + # ]) + + # interleaving a create_timeline which could be reparented will produce two + # permanently different reparentings: one node has reparented, other has + # not + # + # with deletion there is no such problem + def detach_timeline(): + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + def first_branch_gone(): + try: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, first_branch + ) + except PageserverApiException as e: + log.info(f"error {e}") + assert e.status_code == 404 + else: + log.info("still ok") + raise RuntimeError("not done yet") + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + 
wait_until(10, 1.0, first_completed) + + # if we would let victim fail, for some reason there'd be a 409 response instead of 500 + # victim_http.configure_failpoints((pausepoint, "off")) + # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc: + # fut.result() + # assert exc.value.status_code == 409 + + env.storage_controller.pageserver_api().timeline_delete( + env.initial_tenant, first_branch + ) + victim_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_branch_gone) + + # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent + fut.result() + + msg, offset = env.storage_controller.assert_log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*" + ) + log.info(f"expected error message: {msg}") + env.storage_controller.allowed_errors.append( + ".*: shards returned different results matching=0 .*" + ) + + detach_timeline() + + # FIXME: perhaps the above should be automatically retried, if we get mixed results? + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset=offset, + ) + + assert not_found is None + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + # TODO: -# - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted # - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited # - deletion of reparented while reparenting should fail once, then succeed (?) @@ -670,9 +1052,5 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # - investigate: why are layers started at uneven lsn? not just after branching, but in general. # # TEST: 1. tad which partially succeeds, one returns 500 -# 2. create branch below timeline? or delete timeline below +# 2. create branch below timeline? ~or delete reparented timeline~ (done) # 3. on retry all should report the same reparented timelines -# -# TEST: 1. tad is started, one node stalls, other restarts -# 2. client timeout before stall over -# 3. 
on retry with stalled and other being able to proceed From 349373cb11d5f40f69bd9c17f3fdeccadc321141 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 15 Jul 2024 14:55:57 -0700 Subject: [PATCH 247/412] Allow reusing projects between runs of logical replication benchmarks (#8393) --- test_runner/fixtures/neon_api.py | 44 +++ test_runner/fixtures/neon_fixtures.py | 14 +- .../performance/test_logical_replication.py | 335 +++++++----------- 3 files changed, 179 insertions(+), 214 deletions(-) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 39baf5fab6..658ed119a1 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -261,3 +261,47 @@ class NeonAPI: if op["status"] in {"scheduling", "running", "cancelling"}: has_running = True time.sleep(0.5) + + +class NeonApiEndpoint: + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + self.neon_api = neon_api + if project_id is None: + project = neon_api.create_project(pg_version) + neon_api.wait_for_operation_to_finish(project["project"]["id"]) + self.project_id = project["project"]["id"] + self.endpoint_id = project["endpoints"][0]["id"] + self.connstr = project["connection_uris"][0]["connection_uri"] + self.pgbench_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + self.is_new = True + else: + project = neon_api.get_project_details(project_id) + if int(project["project"]["pg_version"]) != int(pg_version): + raise Exception( + f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})" + ) + self.project_id = project_id + eps = neon_api.get_endpoints(project_id)["endpoints"] + self.endpoint_id = eps[0]["id"] + self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ + "uri" + ] + pw = self.connstr.split("@")[0].split(":")[-1] + self.pgbench_env = { + "PGHOST": eps[0]["host"], + "PGDATABASE": "neondb", + "PGUSER": "neondb_owner", + "PGPASSWORD": pw, + } + self.is_new = False + + def restart(self): + self.neon_api.restart_endpoint(self.project_id, self.endpoint_id) + self.neon_api.wait_for_operation_to_finish(self.project_id) + + def get_synthetic_storage_size(self) -> int: + return int( + self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"] + ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 90ed838e1d..fe4a334458 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -87,7 +87,7 @@ from fixtures.utils import ( ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport -from .neon_api import NeonAPI +from .neon_api import NeonAPI, NeonApiEndpoint """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -3158,6 +3158,18 @@ class RemotePostgres(PgProtocol): pass +@pytest.fixture(scope="function") +def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + +@pytest.fixture(scope="function") +def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + @pytest.fixture(scope="function") def remote_pg( test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 5ab83dd31d..53bb29a659 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -import traceback from typing import TYPE_CHECKING import psycopg2 @@ -10,15 +9,12 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync -from fixtures.pg_version import PgVersion if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker - from fixtures.neon_api import NeonAPI + from fixtures.neon_api import NeonApiEndpoint from fixtures.neon_fixtures import NeonEnv, PgBin - from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): @pytest.mark.timeout(2 * 60 * 60) def test_subscriber_lag( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -99,125 +95,82 @@ def test_subscriber_lag( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", 
pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - sub_endpoint_id = sub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + benchmark_project_sub.restart() - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: sub_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-S"], env=sub_env, ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - sub_workload.terminate() - neon_api.restart_endpoint( - sub_project_id, - sub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(sub_project_id) - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - 
"synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: - pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() @pytest.mark.remote_cluster @pytest.mark.timeout(2 * 60 * 60) def test_publisher_restart( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -229,114 +182,70 @@ def test_publisher_restart( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - pub_endpoint_id = pub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = 
sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) - - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") - - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - - pub_workload.terminate() - neon_api.restart_endpoint( - pub_project_id, - pub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(pub_project_id) - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], - env=pub_env, - ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - "synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + benchmark_project_pub.restart() + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = 
benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() From 5b16624bcc2104135485bcd9c2b7a57f3544e6da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 16 Jul 2024 02:16:18 +0200 Subject: [PATCH 248/412] Allow the new clippy::doc_lazy_continuation lint (#8388) The `doc_lazy_continuation` lint of clippy is still unknown on latest rust stable. Fixes fall-out from #8151. --- pageserver/src/tenant/timeline.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 239dce8786..58c6257c65 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3409,6 +3409,7 @@ impl Timeline { } } + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// From ea5460843c889bb5c2ebb4585e92ad554fc6380e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 08:52:49 +0100 Subject: [PATCH 249/412] pageserver: un-Arc Timeline::layers (#8386) ## Problem This structure was in an Arc<> unnecessarily, making it harder to reason about its lifetime (i.e. it was superficially possible for LayerManager to outlive timeline, even though no code used it that way) ## Summary of changes - Remove the Arc<> --- pageserver/src/tenant/timeline.rs | 4 ++-- pageserver/src/tenant/timeline/compaction.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 58c6257c65..48a5b2d32b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -198,7 +198,7 @@ impl PartialOrd for Hole { /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { +fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { drop(rlock) } @@ -271,7 +271,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: Arc>, + pub(crate) layers: tokio::sync::RwLock, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. 
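
To make the effect of removing the `Arc<>` concrete, here is a minimal sketch (not part of the patch). It assumes the generic parameter elided in the hunk above is `LayerManager`, matching the `RwLockReadGuard<'a, LayerManager>` used in the compaction change that follows, and `read_layers` is a hypothetical helper: readers now borrow the lock through the owning `Timeline`, so a guard can no longer outlive it.

```rust
// Sketch only: `layers` is now an inline tokio::sync::RwLock<LayerManager>,
// so the read guard borrows `timeline` rather than being an owned guard
// backed by an Arc.
async fn read_layers(timeline: &Timeline) {
    let guard: tokio::sync::RwLockReadGuard<'_, LayerManager> = timeline.layers.read().await;
    // ... inspect the layer map while holding the borrow ...
    drop(guard);
}
```
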
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index efaa6144af..eec5e5e53c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -27,8 +27,8 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; -use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -379,7 +379,7 @@ impl Timeline { }; let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let phase1_layers_locked = self.layers.read().await; let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); @@ -399,9 +399,9 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, + async fn compact_level0_phase1<'a>( + self: &'a Arc, + guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, From 2ede9d7a25259a8f226eb44c1f2439485b417e3b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 15 Jul 2024 12:48:53 +0100 Subject: [PATCH 250/412] Compute: add compatibility patch for rum Fixes #8251 --- Dockerfile.compute-node | 3 +++ patches/rum.patch | 54 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 patches/rum.patch diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7ab685625a..48a52bfc6d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/rum.patch /rum.patch + RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ + patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control diff --git a/patches/rum.patch b/patches/rum.patch new file mode 100644 index 0000000000..3041f8df81 --- /dev/null +++ b/patches/rum.patch @@ -0,0 +1,54 @@ +commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb +Author: Anastasia Lubennikova +Date: Mon Jul 15 12:31:56 2024 +0100 + + Neon: fix unlogged index build patch + +diff --git a/src/ruminsert.c b/src/ruminsert.c +index e8b209d..e89bf2a 100644 +--- a/src/ruminsert.c ++++ b/src/ruminsert.c +@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + initRumState(&buildstate.rumstate, index); + buildstate.rumstate.isBuild = true; + buildstate.indtuples = 0; +@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + /* + * Write index to xlog + */ +@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + UnlockReleaseBuffer(buffer); + } + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ ++ smgr_end_unlogged_build(index->rd_smgr); ++ } ++#endif ++ + /* + * Return statistics + */ From 9dc71f5a884dd25be14757369072cfb37efd3a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 16 Jul 2024 12:19:28 +0200 Subject: [PATCH 251/412] Avoid the storage controller in test_tenant_creation_fails (#8392) As described in #8385, the likely source for flakiness in test_tenant_creation_fails is the following sequence of events: 1. test instructs the storage controller to create the tenant 2. storage controller adds the tenant and persists it to the database. issues a creation request 3. the pageserver restarts with the failpoint disabled 4. storage controller's background reconciliation still wants to create the tenant 5. pageserver gets new request to create the tenant from background reconciliation This commit just avoids the storage controller entirely. It has its own set of issues, as the re-attach request will obviously not include the tenant, but it's still useful to test for non-existence of the tenant. The generation is also not optional any more during tenant attachment. If you omit it, the pageserver yields an error. We change the signature of `tenant_attach` to reflect that. 
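
For reference, the resulting change at the fixture call site (taken from the diff below) is roughly:

```python
# before: generation was an optional keyword argument
client.tenant_attach(tenant_id, config, generation=generation)

# after: generation is a required positional argument
client.tenant_attach(tenant_id, generation, config)
```
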
Alternative to #8385 Fixes #8266 --- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- test_runner/regress/test_tenants.py | 13 +++---------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fe4a334458..625e9096f5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2786,8 +2786,8 @@ class NeonPageserver(PgProtocol, LogUtils): ) return client.tenant_attach( tenant_id, + generation, config, - generation=generation, ) def tenant_detach(self, tenant_id: TenantId): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d66b94948a..f1e3d1a309 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -238,8 +238,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], + generation: int, config: None | Dict[str, Any] = None, - generation: Optional[int] = None, ): config = config or {} diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 04b3fdd80f..0ebf714de0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -45,17 +45,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path - neon_simple_env.storage_controller.allowed_errors.extend( - [ - ".*Reconcile not done yet while creating tenant.*", - ".*Reconcile error: receive body: error sending request.*", - ".*Error processing HTTP request: InternalServerError.*", - ] - ) + tenant_id = TenantId.generate() - with pytest.raises(Exception, match="error sending request"): - _ = neon_simple_env.neon_cli.create_tenant() + with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"): + neon_simple_env.pageserver.http_client().tenant_attach(tenant_id=tenant_id, generation=1) # Any files left behind on disk during failed creation do not prevent # a retry from succeeding. Restart pageserver with no failpoints. From bf7de92dc2ae15263c03d825ce1dfcd78b5825bb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:20:23 +0200 Subject: [PATCH 252/412] build(deps): bump setuptools from 65.5.1 to 70.0.0 (#8387) Bumps [setuptools](https://github.com/pypa/setuptools) from 65.5.1 to 70.0.0. 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com> --- poetry.lock | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8091141411..5192a574cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2641,19 +2641,18 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.1" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, - {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" From bff505426e711138eec3f851bce3293526657942 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 14:54:54 +0100 Subject: [PATCH 253/412] pageserver: clean up GcCutoffs names (#8379) - `horizon` is a confusing term, it's not at all obvious that this means space-based retention limit, rather than the total GC history limit. Rename to `GcCutoffs::space`. - `pitr` is less confusing, but still an unecessary level of indirection from what we really mean: a time-based condition. 
The fact that we use that that time-history for Point In Time Recovery doesn't mean we have to refer to time as "pitr" everywhere. Rename to `GcCutoffs::time`. --- pageserver/src/tenant.rs | 14 +-- pageserver/src/tenant/size.rs | 61 +++++-------- pageserver/src/tenant/timeline.rs | 94 +++++++++----------- pageserver/src/tenant/timeline/compaction.rs | 4 +- 4 files changed, 75 insertions(+), 98 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6333fd3b63..dc6f42eaeb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2912,7 +2912,7 @@ impl Tenant { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = - timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; } } @@ -2928,7 +2928,7 @@ impl Tenant { timeline.metrics.pitr_history_size.set( timeline .get_last_record_lsn() - .checked_sub(target.cutoffs.pitr) + .checked_sub(target.cutoffs.time) .unwrap_or(Lsn(0)) .0, ); @@ -4262,7 +4262,7 @@ mod tests { .source() .unwrap() .to_string() - .contains("is earlier than latest GC horizon")); + .contains("is earlier than latest GC cutoff")); } } @@ -6718,8 +6718,8 @@ mod tests { { // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.pitr = Lsn(0x30); - guard.cutoffs.horizon = Lsn(0x30); + guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.space = Lsn(0x30); } let expected_result = [ @@ -7109,8 +7109,8 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - pitr: Lsn(0x30), - horizon: Lsn(0x30), + time: Lsn(0x30), + space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 23354417e7..e4728ca8a8 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -135,11 +135,9 @@ pub struct TimelineInputs { ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, /// Cutoff point based on GC settings - next_gc_cutoff: Lsn, + next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, @@ -150,7 +148,7 @@ pub struct TimelineInputs { /// Gathers the inputs for the tenant sizing model. /// -/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// @@ -158,11 +156,8 @@ pub struct TimelineInputs { /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn -/// initdb_lsn branchpoints* next_gc_cutoff latest +/// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` -/// -/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the -/// tenant size will be zero. 
pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -172,7 +167,7 @@ pub(super) async fn gather_inputs( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // refresh is needed to update [`timeline::GcCutoffs`] tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines @@ -236,20 +231,18 @@ pub(super) async fn gather_inputs( // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. // - // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather - // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside - // horizon_cutoff. - let pitr_cutoff = gc_info.cutoffs.pitr; - let horizon_cutoff = gc_info.cutoffs.horizon; - let mut next_gc_cutoff = pitr_cutoff; + // the space cutoff. + let mut next_pitr_cutoff = gc_info.cutoffs.time; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); - if next_gc_cutoff < param_cutoff { - next_gc_cutoff = param_cutoff; + if next_pitr_cutoff < param_cutoff { + next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { @@ -263,7 +256,7 @@ pub(super) async fn gather_inputs( .copied() .collect::>(); - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -291,10 +284,10 @@ pub(super) async fn gather_inputs( ) } - // Add a point for the GC cutoff - let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + // Add a point for the PITR cutoff + let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; if !branch_start_needed { - lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } lsns.sort_unstable(); @@ -333,7 +326,7 @@ pub(super) async fn gather_inputs( parent: Some(parent), lsn: lsn.0, size: None, - needed: lsn > next_gc_cutoff, + needed: lsn > next_pitr_cutoff, }, timeline_id: timeline.timeline_id, kind, @@ -357,8 +350,8 @@ pub(super) async fn gather_inputs( segment: Segment { parent: Some(lease_parent), lsn: lsn.0, - size: None, // Filled in later, if necessary - needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + size: None, // Filled in later, if necessary + needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention. 
}, timeline_id: timeline.timeline_id, kind: LsnKind::LeaseStart, @@ -398,9 +391,7 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + next_pitr_cutoff, retention_param_cutoff, lease_points, }); @@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/2210CD0", - "pitr_cutoff": "0/2210CD0", - "next_gc_cutoff": "0/2210CD0", + "next_pitr_cutoff": "0/2210CD0", "retention_param_cutoff": null, "lease_points": [] }, @@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/1817770", - "pitr_cutoff": "0/1817770", - "next_gc_cutoff": "0/1817770", + "next_pitr_cutoff": "0/1817770", "retention_param_cutoff": null, "lease_points": [] }, @@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/18B3D98", - "pitr_cutoff": "0/18B3D98", - "next_gc_cutoff": "0/18B3D98", + "next_pitr_cutoff": "0/18B3D98", "retention_param_cutoff": null, "lease_points": [] } @@ -820,9 +805,7 @@ fn verify_size_for_one_branch() { "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", - "horizon_cutoff": "47/240A5860", - "pitr_cutoff": "47/240A5860", - "next_gc_cutoff": "47/240A5860", + "next_pitr_cutoff": "47/240A5860", "retention_param_cutoff": "0/0", "lease_points": [] } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 48a5b2d32b..3d3d3ac34d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -478,37 +478,32 @@ impl GcInfo { } } -/// The `GcInfo` component describing which Lsns need to be retained. +/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this +/// is a single number (the oldest LSN which we must retain), but it internally distinguishes +/// between time-based and space-based retention for observability and consumption metrics purposes. #[derive(Debug)] pub(crate) struct GcCutoffs { - /// Keep everything newer than this point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - pub(crate) horizon: Lsn, + /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// history we must keep to retain a specified number of bytes of WAL. + pub(crate) space: Lsn, - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pub(crate) pitr: Lsn, + /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// history we must keep to enable reading back at least the PITR interval duration. 
+ pub(crate) time: Lsn, } impl Default for GcCutoffs { fn default() -> Self { Self { - horizon: Lsn::INVALID, - pitr: Lsn::INVALID, + space: Lsn::INVALID, + time: Lsn::INVALID, } } } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.horizon, self.pitr) + std::cmp::min(self.space, self.time) } } @@ -867,7 +862,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() - .checked_sub(gc_info.cutoffs.pitr) + .checked_sub(gc_info.cutoffs.time) .unwrap_or(Lsn(0)) .0; (history, gc_info.within_ancestor_pitr) @@ -1566,7 +1561,7 @@ impl Timeline { ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); @@ -4944,18 +4939,18 @@ impl Timeline { /// garbage collection. /// /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` - /// controls the time cutoff (or ZERO to disable time-based retention), and `cutoff_horizon` controls + /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls /// the space-based retention. /// /// This function doesn't simply to calculate time & space based retention: it treats time-based /// retention as authoritative if enabled, and falls back to space-based retention if calculating /// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might - /// be different to the `cutoff_horizon` input. Callers should treat the min() of the two cutoffs + /// be different to the `space_cutoff` input. Callers should treat the min() of the two cutoffs /// in the response as the GC cutoff point for the timeline. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, - cutoff_horizon: Lsn, + space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4972,8 +4967,8 @@ impl Timeline { // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup if pitr == Duration::ZERO { return Ok(GcCutoffs { - pitr: self.get_last_record_lsn(), - horizon: cutoff_horizon, + time: self.get_last_record_lsn(), + space: space_cutoff, }); } } @@ -4981,8 +4976,7 @@ impl Timeline { // Calculate a time-based limit on how much to retain: // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup - // based on DEFAULT_PITR_INTERVAL, so that size-based retention (horizon) - // does not result in keeping history around permanently on idle databases. + // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. let time_cutoff = { let now = SystemTime::now(); let time_range = if pitr == Duration::ZERO { @@ -5023,31 +5017,31 @@ impl Timeline { // PITR is not set. Retain the size-based limit, or the default time retention, // whichever requires less data. 
GcCutoffs { - pitr: std::cmp::max(time_cutoff, cutoff_horizon), - horizon: std::cmp::max(time_cutoff, cutoff_horizon), + time: self.get_last_record_lsn(), + space: std::cmp::max(time_cutoff, space_cutoff), } } (Duration::ZERO, None) => { // PITR is not set, and time lookup failed GcCutoffs { - pitr: self.get_last_record_lsn(), - horizon: cutoff_horizon, + time: self.get_last_record_lsn(), + space: space_cutoff, } } (_, None) => { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - pitr: *self.get_latest_gc_cutoff_lsn(), - horizon: cutoff_horizon, + time: *self.get_latest_gc_cutoff_lsn(), + space: space_cutoff, } } (_, Some(time_cutoff)) => { // PITR interval is set and we looked up timestamp successfully. Ignore // size based retention and make time cutoff authoritative GcCutoffs { - pitr: time_cutoff, - horizon: time_cutoff, + time: time_cutoff, + space: time_cutoff, } } }) @@ -5074,11 +5068,11 @@ impl Timeline { return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { + let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.cutoffs.pitr; + let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); + let time_cutoff = gc_info.cutoffs.time; let retain_lsns = gc_info.retain_lsns.clone(); // Gets the maximum LSN that holds the valid lease. @@ -5088,14 +5082,14 @@ impl Timeline { let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); ( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, ) }; - let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. @@ -5124,8 +5118,8 @@ impl Timeline { let res = self .gc_timeline( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, new_gc_cutoff, @@ -5143,8 +5137,8 @@ impl Timeline { async fn gc_timeline( &self, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, + space_cutoff: Lsn, + time_cutoff: Lsn, retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -5205,22 +5199,22 @@ impl Timeline { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { + if l.get_lsn_range().end > space_cutoff { debug!( - "keeping {} because it's newer than horizon_cutoff {}", + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - horizon_cutoff, + space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? 
- if l.get_lsn_range().end > pitr_cutoff { + if l.get_lsn_range().end > time_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff {}", + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - pitr_cutoff, + time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index eec5e5e53c..cbb3303341 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -195,7 +195,7 @@ impl Timeline { tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.pitr + self.gc_info.read().unwrap().cutoffs.time ); let layers = self.layers.read().await; @@ -990,7 +990,7 @@ impl Timeline { "enhanced legacy compaction currently does not support retain_lsns (branches)" ))); } - let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let gc_cutoff = gc_info.cutoffs.select_min(); let mut selected_layers = Vec::new(); // TODO: consider retain_lsns drop(gc_info); From 7e818ee390b541115167e20059948d1ccaf041f9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 12 Jul 2024 13:46:14 -0500 Subject: [PATCH 254/412] Rename compute migrations to start at 1 This matches what we put into the neon_migration.migration_id table. --- compute_tools/src/migration.rs | 17 +++++++++++++--- ...sql => 0001-neon_superuser_bypass_rls.sql} | 0 ...1-alter_roles.sql => 0002-alter_roles.sql} | 0 ...create_subscription_to_neon_superuser.sql} | 0 ...04-grant_pg_monitor_to_neon_superuser.sql} | 0 ...grant_all_on_tables_to_neon_superuser.sql} | 0 ...nt_all_on_sequences_to_neon_superuser.sql} | 0 ...s_to_neon_superuser_with_grant_option.sql} | 0 ...s_to_neon_superuser_with_grant_option.sql} | 0 ...lication_for_previously_allowed_roles.sql} | 0 ...nchronization_funcs_to_neon_superuser.sql} | 0 compute_tools/src/spec.rs | 20 +++++++++---------- 12 files changed, 24 insertions(+), 13 deletions(-) rename compute_tools/src/migrations/{0000-neon_superuser_bypass_rls.sql => 0001-neon_superuser_bypass_rls.sql} (100%) rename compute_tools/src/migrations/{0001-alter_roles.sql => 0002-alter_roles.sql} (100%) rename compute_tools/src/migrations/{0002-grant_pg_create_subscription_to_neon_superuser.sql => 0003-grant_pg_create_subscription_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0003-grant_pg_monitor_to_neon_superuser.sql => 0004-grant_pg_monitor_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0004-grant_all_on_tables_to_neon_superuser.sql => 0005-grant_all_on_tables_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0005-grant_all_on_sequences_to_neon_superuser.sql => 0006-grant_all_on_sequences_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql => 0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql} (100%) rename compute_tools/src/migrations/{0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql => 0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql} (100%) rename compute_tools/src/migrations/{0008-revoke_replication_for_previously_allowed_roles.sql => 0009-revoke_replication_for_previously_allowed_roles.sql} (100%) rename compute_tools/src/migrations/{0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql => 0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql} (100%) diff --git 
a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 61dcf01c84..241ccd4100 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -66,17 +66,28 @@ impl<'m> MigrationRunner<'m> { .context("run_migrations begin")?; while current_migration < self.migrations.len() { + macro_rules! migration_id { + ($cm:expr) => { + ($cm + 1) as i64 + }; + } + let migration = self.migrations[current_migration]; if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); + info!("Skipping migration id={}", migration_id!(current_migration)); } else { info!( "Running migration id={}:\n{}\n", - current_migration, migration + migration_id!(current_migration), + migration ); + self.client.simple_query(migration).with_context(|| { - format!("run_migration current_migration={}", current_migration) + format!( + "run_migration migration id={}", + migration_id!(current_migration) + ) })?; } diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql similarity index 100% rename from compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql rename to compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql similarity index 100% rename from compute_tools/src/migrations/0001-alter_roles.sql rename to compute_tools/src/migrations/0002-alter_roles.sql diff --git a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql rename to compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql diff --git 
a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql similarity index 100% rename from compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql rename to compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql diff --git a/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql rename to compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1d12b88c7c..6a87263821 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -777,21 +777,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // Add new migrations in numerical order. let migrations = [ - include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), - include_str!("./migrations/0001-alter_roles.sql"), - include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), - include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), - include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), - include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0002-alter_roles.sql"), + include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"), include_str!( - "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" ), include_str!( - "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" ), - include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), + include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), include_str!( - "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" + "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), ]; From 85d47637ee2875b42d3068ff2cbb9567d1c49e56 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 12 Jul 2024 13:38:51 -0500 Subject: [PATCH 255/412] Run each migration in its own transaction Previously, every migration was run in the same transaction. This is preparatory work for fixing CVE-2024-4317. 
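The Rust changes below implement this in `compute_tools`; conceptually the runner now behaves like this simplified sketch (Python/psycopg2 is used purely for illustration — the function name and connection object are assumptions, only the `neon_migration.migration_id` table comes from the source):

```python
import psycopg2

def run_migrations(conn, migrations):
    """Apply pending migrations, each inside its own transaction."""
    conn.autocommit = True  # transactions are issued explicitly below
    with conn.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
        current = cur.fetchone()[0]  # last applied migration id, 0 if none yet
        for migration_id, sql in enumerate(migrations, start=1):  # ids start at 1
            if migration_id <= current:
                continue  # already applied earlier
            cur.execute("BEGIN")
            cur.execute(sql)
            # Progress is persisted in the same transaction as the migration,
            # so a failure rolls back both the migration and the id bump.
            cur.execute("UPDATE neon_migration.migration_id SET id = %s", (migration_id,))
            cur.execute("COMMIT")
```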
--- compute_tools/src/migration.rs | 46 +++++++++++--------------- test_runner/fixtures/neon_fixtures.py | 6 ++-- test_runner/regress/test_migrations.py | 7 +--- 3 files changed, 24 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 241ccd4100..22ab145eda 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> { impl<'m> MigrationRunner<'m> { pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 + assert!(migrations.len() + 1 < i64::MAX as usize); + Self { client, migrations } } @@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> { Ok(row.get::<&str, i64>("id")) } - fn update_migration_id(&mut self) -> Result<()> { - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - self.migrations.len() - ); + fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); self.client .simple_query(&setval) @@ -57,14 +57,7 @@ impl<'m> MigrationRunner<'m> { pub fn run_migrations(mut self) -> Result<()> { self.prepare_migrations()?; - let mut current_migration: usize = self.get_migration_id()? as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - self.client - .simple_query(query) - .context("run_migrations begin")?; - + let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { macro_rules! migration_id { ($cm:expr) => { @@ -83,29 +76,30 @@ impl<'m> MigrationRunner<'m> { migration ); + self.client + .simple_query("BEGIN") + .context("begin migration")?; + self.client.simple_query(migration).with_context(|| { format!( - "run_migration migration id={}", + "run_migrations migration id={}", migration_id!(current_migration) ) })?; + + // Migration IDs start at 1 + self.update_migration_id(migration_id!(current_migration))?; + + self.client + .simple_query("COMMIT") + .context("commit migration")?; + + info!("Finished migration id={}", migration_id!(current_migration)); } current_migration += 1; } - self.update_migration_id()?; - - let query = "COMMIT"; - self.client - .simple_query(query) - .context("run_migrations commit")?; - - info!( - "Ran {} migrations", - (self.migrations.len() - starting_migration_id) - ); - Ok(()) } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 625e9096f5..4766b72516 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3798,13 +3798,13 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self): + def wait_for_migrations(self, num_migrations: int = 10): with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall()[0][0] - assert migration_id != 0 + migration_id: int = cur.fetchall()[0][0] + assert migration_id >= num_migrations wait_until(20, 0.5, check_migrations_done) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 91bd3ea50c..880dead4e8 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -11,17 +11,14 @@ def 
test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - endpoint.wait_for_migrations() - num_migrations = 10 + endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") - endpoint.stop() endpoint.start() # We don't have a good way of knowing that the migrations code path finished executing @@ -31,5 +28,3 @@ def test_migrations(neon_simple_env: NeonEnv): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - - endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") From ad5d784fb776de2ba6c2d4adf88e9ababe554cd7 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 15 Jul 2024 10:30:04 -0500 Subject: [PATCH 256/412] Hide import behind TYPE_CHECKING --- test_runner/regress/test_migrations.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 880dead4e8..bdc5ca907e 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -1,6 +1,10 @@ -import time +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_migrations(neon_simple_env: NeonEnv): From 18e7c2b7a10f779a0253b713c0ca3ffd7a1b6c1a Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 15 Jul 2024 10:35:49 -0500 Subject: [PATCH 257/412] Add some typing to Endpoint.respec() --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4766b72516..2765ff916e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3785,12 +3785,12 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers ) - def respec(self, **kwargs): + def respec(self, **kwargs: Any) -> None: """Update the endpoint.json file used by control_plane.""" # Read config config_path = os.path.join(self.endpoint_path(), "endpoint.json") with open(config_path, "r") as f: - data_dict = json.load(f) + data_dict: dict[str, Any] = json.load(f) # Write it back updated with open(config_path, "w") as file: From abe3b4e005a918f5d5ed242095d3cf9a9b4b57a5 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Tue, 16 Jul 2024 15:43:24 -0400 Subject: [PATCH 258/412] fix(pageserver): limit num of delta layers for l0 compaction (#8391) ## Problem close https://github.com/neondatabase/neon/issues/8389 ## Summary of changes A quick mitigation for tenants with fast writes. We compact at most 60 delta layers at a time, expecting a memory footprint of 15GB. We will pick the oldest 60 L0 layers. This should be a relatively safe change so no test is added. Question is whether to make this parameter configurable via tenant config. 
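As a back-of-the-envelope check of the numbers above (illustrative values only; the actual cap is derived from the tenant's `compaction_threshold` and `checkpoint_distance` settings, as the diff below shows):

```python
# Rough sketch of the new per-pass cap on L0 compaction input.
layer_size = 256 * 1024 * 1024   # assume one L0 delta layer is roughly checkpoint_distance
max_layers_per_pass = 60         # the layer count this commit message targets

delta_size_limit = max_layers_per_pass * layer_size
print(f"{delta_size_limit / 2**30:.1f} GiB")  # -> 15.0 GiB of delta data per pass
```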
--------- Signed-off-by: Alex Chi Z Co-authored-by: John Spray --- pageserver/src/tenant/timeline/compaction.rs | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index cbb3303341..f251b667c2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,6 +26,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; @@ -415,6 +416,7 @@ impl Timeline { .map(|x| guard.get_from_desc(&x)) .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { @@ -445,6 +447,22 @@ impl Timeline { let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + // Accumulate the size of layers in `deltas_to_compact` + let mut deltas_to_compact_bytes = 0; + + // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // checkpoint_distance each. To avoid edge cases using extra system resources, bound our + // work in this function to only operate on this much delta data at once. + // + // Take the max of the configured value & the default, so that tests that configure tiny values + // can still use a sensible amount of memory, but if a deployed system configures bigger values we + // still let them compact a full stack of L0s in one go. + let delta_size_limit = std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) as u64 + * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; @@ -453,7 +471,20 @@ impl Timeline { break; } deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; + + if deltas_to_compact_bytes >= delta_size_limit { + info!( + l0_deltas_selected = deltas_to_compact.len(), + l0_deltas_total = level0_deltas.len(), + "L0 compaction picker hit max delta layer size limit: {}", + delta_size_limit + ); + + // Proceed with compaction, but only a subset of L0s + break; + } } let lsn_range = Range { start: deltas_to_compact From b21e131d115a90128d3a3c1b02ecc23fff7dda6c Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 20:55:29 +0100 Subject: [PATCH 259/412] pageserver: exclude un-read layers from short residence statistic (#8396) ## Problem The `evictions_with_low_residence_duration` is used as an indicator of cache thrashing. However, there are situations where it is quite legitimate to only have a short residence during compaction, where a delta is downloaded, used to generate an image layer, and then discarded. This can lead to false positive alerts. 
## Summary of changes - Only track low residence duration for layers that have been accessed at least once (compaction doesn't count as an access). This will give us a metric that indicates thrashing on layers that the _user_ is using, rather than those we're downloading for housekeeping purposes. Once we add "layer visibility" as an explicit property of layers, this can also be used as a cleaner condition (residence of non-visible layers should never be alertable) --- pageserver/src/tenant/storage_layer.rs | 20 ++++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 20 ++++++++++++++------ test_runner/regress/test_tenant_conf.py | 11 +++++++++++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 62730f88b2..2f0c45317d 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -676,6 +676,26 @@ impl LayerAccessStats { }, } } + + /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). + /// + /// This indicates whether the layer has been used for some purpose that would motivate + /// us to keep it on disk, such as for serving a getpage request. + fn accessed(&self) -> bool { + let locked = self.0.lock().unwrap(); + let inner = &locked.for_eviction_policy; + + // Consider it accessed if the most recent access is more recent than + // the most recent change in residence status. + match ( + inner.last_accesses.recent(), + inner.last_residence_changes.recent(), + ) { + (None, _) => false, + (Some(_), None) => true, + (Some(a), Some(r)) => a.when >= r.timestamp, + } + } } /// Get a layer descriptor from a layer. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 02069c29d2..4500bc94dd 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1469,14 +1469,22 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - timeline - .metrics - .evictions_with_low_residence_duration - .read() - .unwrap() - .observe(elapsed); + let accessed = self.access_stats.accessed(); + if accessed { + // Only layers used for reads contribute to our "low residence" metric that is used + // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed + // to be rapidly evicted without contributing to this metric. 
+ timeline + .metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(elapsed); + } + tracing::info!( residence_millis = elapsed.as_millis(), + accessed, "evicted layer after known residence period" ); } diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1a8bc3b983..9fb7324fa1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -8,6 +8,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until +from fixtures.workload import Workload def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -265,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() + # When we evict/download layers, we will use this Workload to generate getpage requests + # that touch some layers, as otherwise the pageserver doesn't report totally unused layers + # as problems when they have short residence duration. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + def get_metric(): metrics = ps_http.get_metrics() metric = metrics.query_one( @@ -285,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert default_value == "1day" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.value) > 0, "metric is updated" @@ -305,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0 ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 @@ -318,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0, "value resets if label changes" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 From 07e78102bf613c723e1c98e752456770524f5250 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 21:36:17 +0100 Subject: [PATCH 260/412] pageserver: reduce size of delta layer ValueRef (#8401) ## Problem ValueRef is an unnecessarily large structure, because it carries a cursor. L0 compaction currently instantiates gigabytes of these under some circumstances. ## Summary of changes - Carry a ref to the parent layer instead of a cursor, and construct a cursor on demand. This reduces RSS high watermark during L0 compaction by about 20%. 
--- .../src/tenant/storage_layer/delta_layer.rs | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2d36ac7442..64412fe4af 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1180,9 +1180,7 @@ impl DeltaLayerInner { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: BlobRef(value), - reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter( - Adapter(self), - )), + layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { @@ -1426,7 +1424,7 @@ impl DeltaLayerInner { let keys = self.load_keys(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -1461,8 +1459,7 @@ impl DeltaLayerInner { use pageserver_api::key::CHECKPOINT_KEY; use postgres_ffi::CheckPoint; if key == CHECKPOINT_KEY { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; - let val = Value::des(&buf)?; + let val = val.load(ctx).await?; match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; @@ -1547,17 +1544,24 @@ pub struct DeltaEntry<'a> { /// Reference to an on-disk value pub struct ValueRef<'a> { blob_ref: BlobRef, - reader: BlockCursor<'a>, + layer: &'a DeltaLayerInner, } impl<'a> ValueRef<'a> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { - // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?; + let buf = self.load_raw(ctx).await?; let val = Value::des(&buf)?; Ok(val) } + + async fn load_raw(&self, ctx: &RequestContext) -> Result> { + let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter( + self.layer, + ))); + let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?; + Ok(buf) + } } pub(crate) struct Adapter(T); From d51ca338c453e0243428d6156cbee4fcbc8bdd05 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 15:25:35 +0100 Subject: [PATCH 261/412] docs/rfcs: timeline ancestor detach API (#6888) ## Problem When a tenant creates a new timeline that they will treat as their 'main' history, it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently this is necessary because it is forbidden to delete a timeline which has descendents. ## Summary of changes A new pageserver API is proposed to 'adopt' data from a parent timeline into one of its children, such that the link between ancestor and child can be severed, leaving the parent in a state where it may then be deleted. 
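For a sense of how a client would drive this, here is a minimal sketch of the proposed call (the endpoint path is the one given in the RFC below; the host, port, IDs, and error handling are illustrative assumptions only):

```python
import requests

# Illustrative values only; a real caller would use the pageserver's
# management API address and real tenant/timeline IDs.
pageserver = "http://localhost:9898"
tenant_id = "0123456789abcdef0123456789abcdef"
child_timeline_id = "fedcba9876543210fedcba9876543210"

# The operation is a series of retryable steps, so repeating the request
# after a failure or timeout is expected to converge on the same end state.
resp = requests.put(
    f"{pageserver}/v1/tenant/{tenant_id}/timeline/{child_timeline_id}/detach_ancestor"
)
resp.raise_for_status()
```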
--------- Co-authored-by: Joonas Koivunen --- docs/rfcs/034-ancestor-deletion.md | 252 +++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 docs/rfcs/034-ancestor-deletion.md diff --git a/docs/rfcs/034-ancestor-deletion.md b/docs/rfcs/034-ancestor-deletion.md new file mode 100644 index 0000000000..7341d930e2 --- /dev/null +++ b/docs/rfcs/034-ancestor-deletion.md @@ -0,0 +1,252 @@ +# Ancestor Timeline Deletion + +Created on: 2024-02-23 + +Author: John Spray + +# Summary + +When a tenant creates a new timeline that they will treat as their 'main' history, +it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently +this is necessary because it is forbidden to delete a timeline which has descendents. + +A new pageserver API is proposed to 'adopt' data from a parent timeline into +one of its children, such that the link between ancestor and child can be severed, +leaving the parent in a state where it may then be deleted. + +# Motivation + +Retaining parent timelines currently has two costs: + +- Cognitive load on users, who have to remember which is the "real" main timeline. +- Storage capacity cost, as the parent timeline will retain layers up to the + child's timeline point, even if the child fully covers its keyspace with image + layers and will never actually read from the parent. + +# Solution + +A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor` +will be added. The `timeline_id` in this URL is that of the _child_ timeline that we +wish to detach from its parent. + +On success, this API will leave the following state: + +- The detached child timeline will no longer have an ancestor, and will contain all + the data needed to service reads without recursing into an ancestor. +- Any other children of the parent whose timeline points were at a lower LSN than + the detached child timeline will be modified to have the child timeline as their + new parent. +- The parent timeline will still exist, but the child will no longer have it as an + ancestor. If this was the last timeline that depended on the parent, then the + parent will become deletable. + +This API's implementation will consist of a series of retryable steps, such that +on failures/timeout it can safely be called again to reach the target state. + +## Example + +### Before + +The user has "rolled back" their project to LSN X, resulting in a "new main" +timeline. The parent "old main" timeline still exists, and they would like +to clean it up. + +They have two other timelines A and B. A is from before the rollback point, +and B is from after the rollback point. + +``` +----"old main" timeline-------X--------------------------------------------> + | | | + |-> child A | | + |-> "new main" timeline | + -> child B + +``` + +### After calling detach ancestor API + +The "new main" timeline is no longer dependent on old main, and neither +is child A, because it had a branch point before X. + +The user may now choose to delete child B and "old main" to get to +a pristine state. Child B is likely to be unwanted since the user +chose to roll back to X, and it branches from after X. However, we +don't assume this in the API; it is up to the user to delete it. 
+ +``` +|----"old main" timeline----------------------------------------------------> + | + | + | + -> child B + +|----"new main" timeline---------> + | + |-> child A + + +``` + +### After removing timelines + +We end up with a totally clean state that leaves no trace that a rollback +ever happened: there is only one root timeline. + +``` +| ----"new main" timeline-----------> + | + |-> child A + + +``` + +## Caveats + +Important things for API users to bear in mind: + +- this API does not delete the parent timeline: you must still do that explicitly. +- if there are other child timelines ahead of the branch point of the detached + child, the parent won't be deletable: you must either delete or detach those + children. +- do _not_ simply loop over all children and detach them all: this can have an + extremely high storage cost. The detach ancestor API is intended for use on a single + timeline to make it the new "main". +- The detach ancestor API should also not be + exposed directly to the user as button/API, because they might decide + to click it for all the children and thereby generate many copies of the + parent's data -- the detach ancestor API should be used as part + of a high level "clean up after rollback" feature. + +## `detach_ancestor` API implementation + +Terms used in the following sections: + +- "the child": the timeline whose ID is specified in the detach ancestor API URL, also + called "new main" in the example. +- "the parent": the parent of "the child". Also called "old main" in the example. +- "the branch point" the ancestor_lsn of "the child" + +### Phase 1: write out adopted layers to S3 + +The child will "adopt" layers from the parent, such that its end state contains +all the parent's history as well as its own. + +For all layers in the parent's layer map whose high LSN is below the branch +point, issue S3 CopyObject requests to duplicate them into the child timeline's +prefix. Do not add them to the child's layer map yet. + +For delta layers in the parent's layer map which straddle the branch point, read them +and write out only content up to the branch point into new layer objects. + +This is a long running operation if the parent has many layers: it should be +implemented in a way that resumes rather than restarting from scratch, if the API +times out and is called again. + +As an optimization, if there are no other timelines that will be adopted into +the child, _and_ the child's image layers already full cover the branch LSN, +then we may skip adopting layers. + +### Phase 2: update the child's index + +Having written out all needed layers in phase 1, atomically link them all +into the child's IndexPart and upload to S3. This may be done while the +child Timeline is still running. + +### Phase 3: modify timelines ancestry + +Modify the child's ancestor to None, and upload its IndexPart to persist the change. + +For all timelines which have the same parent as the child, and have a branch +point lower than our branch point, switch their ancestor_timeline to the child, +and upload their IndexPart to persist the change. + +## Alternatives considered + +### Generate full image layer on child, rather than adopting parent deltas + +This would work for the case of a single child, but would prevent re-targeting +other timelines that depended on the parent. If we detached many children this +way, the storage cost would become prohibitive (consider a 1TB database with +100 child timelines: it would cost 100TiB if they all generated their own image layers). 
+ +### Don't rewrite anything: just fake it in the API + +We could add a layer of indirection that let a child "pretend" that it had no +ancestor, when in reality it still had the parent. The pageserver API could +accept deletion of ancestor timelines, and just update child metadata to make +them look like they have no ancestor. + +This would not achieve the desired reduction in storage cost, and may well be more +complex to maintain than simply implementing the API described in this RFC. + +### Avoid copying objects: enable child index to use parent layers directly + +We could teach IndexPart to store a TimelineId for each layer, such that a child +timeline could reference a parent's layers directly, rather than copying them +into the child's prefix. + +This would impose a cost for the normal case of indices that only target the +timeline's own layers, add complexity, and break the useful simplifying +invariant that timelines "own" their own path. If child timelines were +referencing layers from the parent, we would have to ensure that the parent +never runs GC/compaction again, which would make the API less flexible (the +proposal in this RFC enables deletion of the parent but doesn't require it.) + +## Performance + +### Adopting layers + +- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands + of such requests: this can take up to tens of seconds and will compete for RemoteStorage + semaphore units with other activity on the pageserver. +- If we are running on storage backend that doesn't implement CopyObject, then + this part will be much more expensive as we would stream all layer content + through the pageserver. This is no different to issuing a lot + of reads to a timeline that does not have a warm local cache: it will move + a lot of gigabytes, but that shouldn't break anything. +- Generating truncated layers for delta that straddle the branch point will + require streaming read/write of all the layers in question. + +### Updating timeline ancestry + +The simplest way to update timeline ancestry will probably be to stop and start +all the Timeline objects: this is preferable to the complexity of making their +ancestry mutable at runtime. + +There will be a corresponding "stutter" in the availability of the timelines, +of the order 10-100ms, which is the time taken to upload their IndexPart, and +restart the Timeline. + +# Interaction with other features + +## Concurrent timeline creation + +If new historic timelines are created using the parent as an ancestor while the +detach ancestor API is running, they will not be re-parented to the child. This +doesn't break anything, but it leaves the parent in a state where it might not +be possible to delete it. + +Since timeline creations are an explicit user action, this is not something we need to +worry about as the storage layer: a user who wants to delete their parent timeline will not create +new children, and if they do, they can choose to delete those children to +enable deleting the parent. + +For the least surprise to the user, before starting the detach ancestor branch +operation, the control plane should wait until all branches are created and not +allow any branches to be created before the branch point on the ancestor branch +while the operation is ongoing. + +## WAL based disaster recovery + +WAL based disaster recovery currently supports only restoring of the main +branch. 
Enabling WAL based disaster recovery in the future requires that we +keep a record which timeline generated the WAL and at which LSN was a parent +detached. Keep a list of timeline ids and the LSN in which they were detached in +the `index_part.json`. Limit the size of the list to 100 first entries, after +which the WAL disaster recovery will not be possible. + +## Sharded tenants + +For sharded tenants, calls to the detach ancestor API will pass through the storage +controller, which will handle them the same as timeline creations: invoke first +on shard zero, and then on all the other shards. From 9f796ebba9c089881fa66f402c08cb10df370b44 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 17 Jul 2024 16:56:32 +0200 Subject: [PATCH 262/412] Bodobolero/pgbench compare azure (#8409) ## Problem We want to run performance tests on all supported cloud providers. We want to run most tests on the postgres version which is default for new projects in production, currently (July 24) this is postgres version 16 ## Summary of changes - change default postgres version for some (performance) tests to 16 (which is our default for new projects in prod anyhow) - add azure region to pgbench_compare jobs - add azure region to pgvector benchmarking jobs - re-used project `weathered-snowflake-88107345` was prepared with 1 million embeddings running on 7 minCU 7 maxCU in azure region to compare with AWS region (pgvector indexing and hnsw queries) - see job pgbench-pgvector - Note we now have a 11 environments combinations where we run pgbench-compare and 5 are for k8s-pod (deprecated) which we can remove in the future once auto-scaling team approves. ## Logs A current run with the changes from this pull request is running here https://github.com/neondatabase/neon/actions/runs/9972096222 Note that we currently expect some failures due to - https://github.com/neondatabase/neon/issues/8275 - instability of projects on azure region --- .../actions/neon-project-create/action.yml | 4 +- .github/workflows/benchmarking.yml | 70 ++++++++++++++----- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 16759ad038..d4029bd37c 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -9,8 +9,8 @@ inputs: description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - description: 'Postgres version; default is 15' - default: '15' + description: 'Postgres version; default is 16' + default: '16' api_host: description: 'Neon API host' default: console-stage.neon.build diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d038f64f15..d785156a29 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -59,7 +59,7 @@ jobs: strategy: matrix: include: - - DEFAULT_PG_VERSION: 14 + - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} provisioner: 'k8s-pod' @@ -146,6 +146,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} replication-tests: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 @@ -190,6 +191,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster 
--timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -215,11 +217,14 @@ jobs: # Available platforms: # - neon-captest-new: Freshly created project (1 CU) # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region + # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} + DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -230,23 +235,33 @@ jobs: - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | + region_id_default=${{ env.DEFAULT_REGION_ID }} matrix='{ + "pg_version" : [ + 16 + ], + "region_id" : [ + "'"$region_id_default"'" + ], "platform": [ "neon-captest-new", "neon-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}, + { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -298,7 +313,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ 
-323,14 +338,14 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} + compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String @@ -343,7 +358,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) + neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -368,6 +383,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -381,6 +397,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -394,6 +411,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -420,6 +438,12 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + strategy: + matrix: + include: + - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "azure-captest-pgvector" + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -428,7 +452,7 @@ jobs: TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-captest-pgvector" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -448,7 +472,18 @@ jobs: - name: Set up Connection String id: set-up-connstr run: | - CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + case "${PLATFORM}" in + neon-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + ;; + azure-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT @@ -460,6 +495,7 @@ jobs: run_in_parallel: false 
save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -473,6 +509,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -487,7 +524,7 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -735,6 +772,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" From a98ccd185b109cbfefe64a5d56d6ec30684a8d59 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 17 Jul 2024 11:22:38 -0400 Subject: [PATCH 263/412] test(pageserver): more k-merge tests on duplicated keys (#8404) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing tenants and some selection of layers might produce duplicated keys. Add tests to ensure the k-merge iterator handles it correctly. We also enforced ordering of the k-merge iterator to put images before deltas. part of https://github.com/neondatabase/neon/issues/8002 --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- .../src/tenant/storage_layer/delta_layer.rs | 16 +- .../tenant/storage_layer/merge_iterator.rs | 163 ++++++++++++++++-- 2 files changed, 163 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 64412fe4af..43941b6e17 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1672,6 +1672,7 @@ pub(crate) mod test { use rand::RngCore; use super::*; + use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::Tenant; @@ -1681,6 +1682,7 @@ pub(crate) mod test { tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, DEFAULT_PG_VERSION, }; + use bytes::Bytes; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. 
Finally, @@ -2249,6 +2251,15 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + pub(crate) fn sort_delta_value( + (k1, l1, v1): &(Key, Lsn, Value), + (k2, l2, v2): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + let order_1 = if v1.is_image() { 0 } else { 1 }; + let order_2 = if v2.is_image() { 0 } else { 1 }; + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + pub(crate) async fn produce_delta_layer( tenant: &Tenant, tline: &Arc, @@ -2257,7 +2268,7 @@ pub(crate) mod test { ) -> anyhow::Result { deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); - let (key_max, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.last().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); let lsn_end = Lsn(lsn_max.0 + 1); @@ -2302,9 +2313,6 @@ pub(crate) mod test { #[tokio::test] async fn delta_layer_iterator() { - use crate::repository::Value; - use bytes::Bytes; - let harness = TenantHarness::create("delta_layer_iterator").unwrap(); let (tenant, ctx) = harness.load().await; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 68759f7585..0edfd4bd40 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { impl<'a> std::cmp::Ord for IteratorWrapper<'a> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; - let a = self.peek_next_key_lsn(); - let b = other.peek_next_key_lsn(); + let a = self.peek_next_key_lsn_value(); + let b = other.peek_next_key_lsn_value(); match (a, b) { - (Some((k1, l1)), Some((k2, l2))) => { - let loaded_1 = if self.is_loaded() { 1 } else { 0 }; - let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + (Some((k1, l1, v1)), Some((k2, l2, v2))) => { + fn map_value_to_num(val: &Option<&Value>) -> usize { + match val { + None => 0, + Some(Value::Image(_)) => 1, + Some(Value::WalRecord(_)) => 2, + } + } + let order_1 = map_value_to_num(&v1); + let order_2 = map_value_to_num(&v2); // When key_lsn are the same, the unloaded iter will always appear before the loaded one. // And note that we do a reverse at the end of the comparison, so it works with the max heap. - (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + (k1, l1, order_1).cmp(&(k2, l2, order_2)) } (Some(_), None) => Ordering::Less, (None, Some(_)) => Ordering::Greater, @@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> { } } - fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::Loaded { iter } => iter + .peek() + .as_ref() + .map(|(key, lsn, val)| (key, *lsn, Some(val))), Self::NotLoaded { first_key_lower_bound: (key, lsn), .. - } => Some((key, *lsn)), + } => Some((key, *lsn, None)), } } @@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> { } } +/// A merge iterator over delta/image layer iterators. When duplicated records are +/// found, the iterator will not perform any deduplication, and the caller should handle +/// these situation. By saying duplicated records, there are many possibilities: +/// * Two same delta at the same LSN. +/// * Two same image at the same LSN. +/// * Delta/image at the same LSN where the image has already applied the delta. 
+/// The iterator will always put the image before the delta. pub struct MergeIterator<'a> { heap: BinaryHeap>, } @@ -245,8 +262,9 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, }, + walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; @@ -407,6 +425,127 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } - // TODO: image layer merge, delta+image mixed merge - // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that + #[tokio::test] + async fn delta_image_mixed_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + // In this test case, we want to test if the iterator still works correctly with multiple copies + // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab. + // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix. + // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation + // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation + // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should + // correctly process these situations and return everything as-is, and the upper layer of the system + // will handle duplicated LSNs. 
+ let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(0), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("a")), + ), + ( + get_key(5), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(5), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("b")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas2 = test_deltas1.clone(); + test_deltas2.push(( + get_key(10), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"")), + ), + ( + get_key(5), + Lsn(0x18), + Value::Image(Bytes::copy_from_slice(b"b")), + ), + ( + get_key(15), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas4 = test_deltas3.clone(); + test_deltas4.push(( + get_key(20), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx) + .await + .unwrap(); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.extend(test_deltas4); + expect.sort_by(sort_delta_value); + + // Test with different layer order for MergeIterator::create to ensure the order + // is stable. + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } } From c150ad4ee25cf47b1d1f1d1837b39c9c72e678aa Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 18:35:27 +0100 Subject: [PATCH 264/412] tests: add test_compaction_l0_memory (#8403) This test reproduces the case of a writer creating a deep stack of L0 layers. It uses realistic layer sizes and writes several gigabytes of data, therefore runs as a performance test although it is validating memory footprint rather than performance per se. It acts a regression test for two recent fixes: - https://github.com/neondatabase/neon/pull/8401 - https://github.com/neondatabase/neon/pull/8391 In future it will demonstrate the larger improvement of using a k-merge iterator for L0 compaction (#8184) This test can be extended to enforce limits on the memory consumption of other housekeeping steps, by restarting the pageserver and then running other things to do the same "how much did RSS increase" measurement. 
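
As a rough sketch (not part of this patch), that extension could reuse the same high-water-mark pattern; the only fixture call assumed below is `get_metric_value`, which this test already uses, and the housekeeping step and budget are placeholders:

```python
# Hypothetical helper for follow-up tests: measure how much the pageserver's
# peak RSS grows across a single housekeeping step.
def measure_housekeeping_rss_growth(pageserver_http, run_step) -> int:
    def rss_hwm() -> int:
        # libmetrics_maxrss_kb reports the pageserver's peak RSS in KiB
        v = pageserver_http.get_metric_value("libmetrics_maxrss_kb")
        assert v is not None and v > 0
        return int(v) * 1024

    before = rss_hwm()
    run_step()  # e.g. lambda: pageserver_http.timeline_compact(tenant_id, timeline_id)
    after = rss_hwm()
    return after - before
```

A caller would restart the pageserver first, so the high-water mark starts from a fresh baseline, and then assert the returned growth against a per-step budget.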
--- test_runner/fixtures/pageserver/http.py | 3 + test_runner/performance/test_compaction.py | 96 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index f1e3d1a309..c7cea4ec04 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -117,6 +117,9 @@ class LayerMapInfo: def image_layers(self) -> List[HistoricLayerInfo]: return [x for x in self.historic_layers if x.kind == "Image"] + def delta_l0_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta" and x.l0] + def historic_by_name(self) -> Set[str]: return set(x.layer_file_name for x in self.historic_layers) diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 326c4f5c6f..077b76104c 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -2,6 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log from fixtures.neon_fixtures import wait_for_last_flush_lsn @@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare): pageserver_http.timeline_compact(tenant_id, timeline_id) neon_compare.report_size() + + +def test_compaction_l0_memory(neon_compare: NeonCompare): + """ + Generate a large stack of L0s pending compaction into L1s, and + measure the pageserver's peak RSS while doing so + """ + + env = neon_compare.env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # Initially disable compaction so that we will build up a stack of L0s + "compaction_period": "0s", + "gc_period": "0s", + } + ) + neon_compare.tenant = tenant_id + neon_compare.timeline = timeline_id + + endpoint = env.endpoints.create_start( + "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] + ) + + # Read tenant effective config and assert on checkpoint_distance and compaction_threshold, + # as we do want to test with defaults (to be same as the field), but this test's workload size makes assumptions about them. + # + # If these assertions fail, it probably means we changed the default. 
+ tenant_conf = pageserver_http.tenant_config(tenant_id) + assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 + assert tenant_conf.effective_config["compaction_threshold"] == 10 + + # Aim to write about 20 L0s, so that we will hit the limit on how many + # to compact at once + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + for i in range(200): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() + + # Check we have generated the L0 stack we expected + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + initial_l0s = len(layers.delta_l0_layers()) + initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})") + + def rss_hwm(): + v = pageserver_http.get_metric_value("libmetrics_maxrss_kb") + assert v is not None + assert v > 0 + return v * 1024 + + before = rss_hwm() + pageserver_http.timeline_compact(tenant_id, timeline_id) + after = rss_hwm() + + log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})") + + assert after > before # If we didn't use some memory the test is probably buggy + compaction_mapped_rss = after - before + + # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some, + # because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate + # repeated references to the same key. + # + # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which + # this memory estimate can be revised far downwards to something that doesn't scale + # linearly with the layer sizes. + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 + + # If we find that compaction is using more memory, this may indicate a regression + assert compaction_mapped_rss < MEMORY_ESTIMATE + + # If we find that compaction is using <0.5 the expected memory then: + # - maybe we made a big efficiency improvement, in which case update the test + # - maybe something is functionally wrong with the test and it's not driving the system as expected + assert compaction_mapped_rss > MEMORY_ESTIMATE / 2 + + # We should have compacted some but not all of the l0s, based on the limit on how much + # l0 to compact in one go + assert len(layers.delta_l0_layers()) > 0 + assert len(layers.delta_l0_layers()) < initial_l0s + + # The pageserver should have logged when it hit the compaction size limit + env.pageserver.assert_log_contains(".*hit max delta layer size limit.*") From ae1af558b4ba48c1b48415967e00839584688c50 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:19:40 -0400 Subject: [PATCH 265/412] docs: update storage controller db name in doc (#8411) The db name was renamed to storage_controller from attachment_service. Doc was stale. 
--- docs/storage_controller.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index daf4d0c8b7..6d2ef929a4 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration: - Use `diesel migration generate ` to create a new migration - Populate the SQL files in the `migrations/` subdirectory - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. - - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller` - Commit the migration files and the changes to schema.rs - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. From ef3ebfaf67592d30462f585b6828ca86175be381 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 21:55:20 +0100 Subject: [PATCH 266/412] pageserver: layer count & size metrics (#8410) ## Problem We lack insight into: - How much of a tenant's physical size is image vs. delta layers - Average sizes of image vs. delta layers - Total layer counts per timeline, indicating size of index_part object As well as general observability love, this is motivated by https://github.com/neondatabase/neon/issues/6738, where we need to define some sensible thresholds for storage amplification, and using total physical size may not work well (if someone does a lot of DROPs then it's legitimate for the physical-synthetic ratio to be huge), but the ratio between image layer size and delta layer size may be a better indicator of whether we're generating unreasonable quantities of image layers. ## Summary of changes - Add pageserver_layer_bytes and pageserver_layer_count metrics, labelled by timeline and `kind` (delta or image) - Add & subtract these with LayerInner's lifetime. I'm intentionally avoiding using a generic metric RAII guard object, to avoid bloating LayerInner: it already has all the information it needs to update metric on new+drop. 
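
For illustration only (not part of this change), a test could read these metrics back through the fixtures' `get_metric_value` helper to compute that image/delta ratio; the sketch assumes an unsharded tenant so each query matches a single series:

```python
# Sketch: derive the image/delta physical size ratio for one timeline from the
# new pageserver_layer_bytes metric (labels: tenant_id, shard_id, timeline_id, kind).
def layer_bytes(http_client, tenant_id, timeline_id, kind: str) -> float:
    v = http_client.get_metric_value(
        "pageserver_layer_bytes",
        {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "kind": kind},
    )
    return float(v or 0)

def image_delta_ratio(http_client, tenant_id, timeline_id) -> float:
    image = layer_bytes(http_client, tenant_id, timeline_id, "image")
    delta = layer_bytes(http_client, tenant_id, timeline_id, "delta")
    return image / max(delta, 1.0)
```

A storage-amplification threshold could then be applied to this ratio rather than to total physical size.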
--- pageserver/src/metrics.rs | 94 ++++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 21 +++++ test_runner/fixtures/metrics.py | 2 + 3 files changed, 117 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index abad4b44b8..753f5524c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_archive_size", @@ -2141,6 +2166,10 @@ pub(crate) struct TimelineMetrics { pub last_record_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2223,6 +2252,42 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2277,6 +2342,10 @@ impl TimelineMetrics { last_record_gauge, pitr_history_size, archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2338,6 +2407,31 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = 
TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 4500bc94dd..dbf6c60aae 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -693,6 +693,18 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -791,6 +803,15 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + LayerInner { conf, debug_str: { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c019cbbc77..4836d42db5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_archive_size", "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", From ff174a88c0544d1270a6f993810d67a6eb4cc0b2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Jul 2024 00:03:02 +0300 Subject: [PATCH 267/412] test: allow requests to any pageserver get cancelled (#8413) Fix flakyness on `test_sharded_timeline_detach_ancestor` which does not reproduce on a fast enough runner by allowing cancelled request before completing on all pageservers. It was only allowed on half of the pageservers. 
Failure evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8352/9972357040/index.html#suites/a1c2be32556270764423c495fad75d47/7cca3e3d94fe12f2 --- .../regress/test_timeline_detach_ancestor.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d75ab4c060..38f8dfa885 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -702,20 +702,16 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # make another of the nodes get stuck, then restart stuck = pageservers[int(shards[0]["node_id"])] - stuck.allowed_errors.append(".*: request was dropped before completing") - env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + log.info(f"stuck pageserver is id={stuck.id}") stuck_http = stuck.http_client() stuck_http.configure_failpoints( ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") ) restarted = pageservers[int(shards[1]["node_id"])] - restarted.allowed_errors.extend( - [ - ".*: request was dropped before completing", - ".*: Cancelled request finished with an error: ShuttingDown", - ] - ) + log.info(f"restarted pageserver is id={restarted.id}") + # this might be hit; see `restart_restarted` + restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown") assert restarted.id != stuck.id restarted_http = restarted.http_client() restarted_http.configure_failpoints( @@ -724,6 +720,14 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): ] ) + for info in shards: + pageserver = pageservers[int(info["node_id"])] + # the first request can cause these, but does not repeatedly + pageserver.allowed_errors.append(".*: request was dropped before completing") + + # first request again + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + target = env.storage_controller.pageserver_api() with pytest.raises(ReadTimeout): From 82a2081d61c423d19b79fb8c08ca3945c3feadcd Mon Sep 17 00:00:00 2001 From: dotdister Date: Thu, 18 Jul 2024 17:33:46 +0900 Subject: [PATCH 268/412] Fix comment in Control Plane (#8406) ## Problem There are something wrong in the comment of `control_plane/src/broker.rs` and `control_plane/src/pageserver.rs` ## Summary of changes Fixed the comment about component name and their data path in `control_plane/src/broker.rs` and `control_plane/src/pageserver.rs`. --- control_plane/src/broker.rs | 4 ++-- control_plane/src/pageserver.rs | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c3cfc140da..c8ac5d8981 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,9 +1,9 @@ //! Code to manage the storage broker //! -//! In the local test environment, the data for each safekeeper is stored in +//! In the local test environment, the storage broker stores its data directly in //! //! ```text -//! .neon/safekeepers/ +//! .neon //! ``` use std::time::Duration; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5f2373e95a..e3d1d0e110 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,8 +1,10 @@ //! Code to manage pageservers //! -//! 
In the local test environment, the pageserver stores its data directly in +//! In the local test environment, the data for each pageserver is stored in //! -//! .neon/ +//! ```text +//! .neon/pageserver_ +//! ``` //! use std::collections::HashMap; From 3d2c2ce139de5221ce4ac0c046f0c850bd174652 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 18 Jul 2024 10:56:07 +0200 Subject: [PATCH 269/412] NeonEnv.from_repo_dir: use storage_controller_db instead of `attachments.json` (#8382) When `NeonEnv.from_repo_dir` was introduced, storage controller stored its state exclusively `attachments.json`. Since then, it has moved to using Postgres, which stores its state in `storage_controller_db`. But `NeonEnv.from_repo_dir` wasn't adjusted to do this. This PR rectifies the situation. Context for this is failures in `test_pageserver_characterize_throughput_with_n_tenants` CF: https://neondb.slack.com/archives/C033RQ5SPDH/p1721035799502239?thread_ts=1720901332.293769&cid=C033RQ5SPDH Notably, `from_repo_dir` is also used by the backwards- and forwards-compatibility. Thus, the changes in this PR affect those tests as well. However, it turns out that the compatibility snapshot already contains the `storage_controller_db`. Thus, it should just work and in fact we can remove hacks like `fixup_storage_controller`. Follow-ups created as part of this work: * https://github.com/neondatabase/neon/issues/8399 * https://github.com/neondatabase/neon/issues/8400 --- Cargo.lock | 27 +++++ Cargo.toml | 1 + control_plane/Cargo.toml | 1 + control_plane/src/storage_controller.rs | 87 +++++++++++---- storage_controller/src/main.rs | 19 +--- storage_controller/src/persistence.rs | 100 ++---------------- test_runner/fixtures/neon_fixtures.py | 27 ++++- ...er_max_throughput_getpage_at_latest_lsn.py | 8 -- test_runner/regress/test_compatibility.py | 25 ----- 9 files changed, 133 insertions(+), 162 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8897364701..d08da0babd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1368,6 +1368,7 @@ dependencies = [ "tracing", "url", "utils", + "whoami", "workspace_hack", ] @@ -4603,6 +4604,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -6972,6 +6982,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -7124,6 +7140,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 4f42203683..b9b4bafb4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -191,6 +191,7 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.7" x509-parser = "0.15" 
+whoami = "1.5.1" ## TODO replace this with tracing env_logger = "0.10" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index e62f3b8a47..487ac8f047 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -40,6 +40,7 @@ safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +whoami.workspace = true compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 47103a2e0a..d7aedd711a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -29,7 +29,6 @@ use utils::{ pub struct StorageController { env: LocalEnv, listen: String, - path: Utf8PathBuf, private_key: Option>, public_key: Option, postgres_port: u16, @@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const DB_NAME: &str = "storage_controller"; + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -65,10 +66,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) - .unwrap() - .join("attachments.json"); - // Makes no sense to construct this if pageservers aren't going to use it: assume // pageservers have control plane API set let listen_url = env.control_plane_api.clone().unwrap(); @@ -128,7 +125,6 @@ impl StorageController { Self { env: env.clone(), - path, listen, private_key, public_key, @@ -203,7 +199,6 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self) -> anyhow::Result { - const DB_NAME: &str = "storage_controller"; let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -232,6 +227,30 @@ impl StorageController { Ok(database_url) } + pub async fn connect_to_database( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { + tokio_postgres::Config::new() + .host("localhost") + .port(self.postgres_port) + // The user is the ambient operating system user name. + // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 + // + // Until we get there, use the ambient operating system user name. + // Recent tokio-postgres versions default to this if the user isn't specified. + // But tokio-postgres fork doesn't have this upstream commit: + // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 + // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 + .user(&whoami::username()) + .dbname(DB_NAME) + .connect(tokio_postgres::NoTls) + .await + .map_err(anyhow::Error::new) + } + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) @@ -256,18 +275,21 @@ impl StorageController { if !status.success() { anyhow::bail!("initdb failed with status {status}"); } - - // Write a minimal config file: - // - Specify the port, since this is chosen dynamically - // - Switch off fsync, since we're running on lightweight test environments and when e.g. 
scale testing - // the storage controller we don't want a slow local disk to interfere with that. - tokio::fs::write( - &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), - ) - .await?; }; + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", self.postgres_port), + ) + .await?; + println!("Starting storage controller database..."); let db_start_args = [ "-w", @@ -296,11 +318,38 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; + // We support running a startup SQL script to fiddle with the database before we launch storcon. + // This is used by the test suite. + let startup_script_path = self + .env + .base_data_dir + .join("storage_controller_db.startup.sql"); + let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { + Ok(script) => { + tokio::fs::remove_file(startup_script_path).await?; + script + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // always run some startup script so that this code path doesn't bit rot + "BEGIN; COMMIT;".to_string() + } else { + anyhow::bail!("Failed to read startup script: {e}") + } + } + }; + let (mut client, conn) = self.connect_to_database().await?; + let conn = tokio::spawn(conn); + let tx = client.build_transaction(); + let tx = tx.start().await?; + tx.batch_execute(&startup_script).await?; + tx.commit().await?; + drop(client); + conn.await??; + let mut args = vec![ "-l", &self.listen, - "-p", - self.path.as_ref(), "--dev", "--database-url", &database_url, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1eb0b30fc..4bf6b528f4 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,5 +1,4 @@ use anyhow::{anyhow, Context}; -use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; @@ -51,10 +50,6 @@ struct Cli { #[arg(long)] compute_hook_url: Option, - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -206,11 +201,10 @@ async fn async_main() -> anyhow::Result<()> { let args = Cli::parse(); tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", + "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); @@ -277,8 +271,7 @@ async fn async_main() -> anyhow::Result<()> { .await .context("Running database migrations")?; - let json_path = args.path; - let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; @@ -316,14 +309,6 @@ async fn async_main() -> anyhow::Result<()> { 
} tracing::info!("Terminating on signal"); - if json_path.is_some() { - // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing - // full postgres dumps around. - if let Err(e) = persistence.write_tenants_json().await { - tracing::error!("Failed to write JSON on shutdown: {e}") - } - } - // Stop HTTP server first, so that we don't have to service requests // while shutting down Service server_shutdown.cancel(); diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9f7b2f775e..d8f31e86e5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,8 +5,6 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use camino::Utf8Path; -use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; @@ -55,11 +53,6 @@ use crate::node::Node; /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { connection_pool: diesel::r2d2::Pool>, - - // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of - // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward - // compatible just yet. - json_path: Option, } /// Legacy format, for use in JSON compat objects in test environment @@ -124,7 +117,7 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String, json_path: Option) -> Self { + pub fn new(database_url: String) -> Self { let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time @@ -139,10 +132,7 @@ impl Persistence { .build(manager) .expect("Could not build connection pool"); - Self { - connection_pool, - json_path, - } + Self { connection_pool } } /// A helper for use during startup, where we would like to tolerate concurrent restarts of the @@ -302,85 +292,13 @@ impl Persistence { /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { - let loaded = self - .with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }, - ) - .await?; - - if loaded.is_empty() { - if let Some(path) = &self.json_path { - if tokio::fs::try_exists(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? 
- { - tracing::info!("Importing from legacy JSON format at {path}"); - return self.list_tenant_shards_json(path).await; - } - } - } - Ok(loaded) - } - - /// Shim for automated compatibility tests: load tenants from a JSON file instead of database - pub(crate) async fn list_tenant_shards_json( - &self, - path: &Utf8Path, - ) -> DatabaseResult> { - let bytes = tokio::fs::read(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; - - let mut decoded = serde_json::from_slice::(&bytes) - .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for shard in decoded.tenants.values_mut() { - if shard.placement_policy == "\"Single\"" { - // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 - shard.placement_policy = "{\"Attached\":0}".to_string(); - } - - if shard.scheduling_policy.is_empty() { - shard.scheduling_policy = - serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); - } - } - - let tenants: Vec = decoded.tenants.into_values().collect(); - - // Synchronize database with what is in the JSON file - self.insert_tenant_shards(tenants.clone()).await?; - - Ok(tenants) - } - - /// For use in testing environments, where we dump out JSON on shutdown. - pub async fn write_tenants_json(&self) -> anyhow::Result<()> { - let Some(path) = &self.json_path else { - anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); - }; - tracing::info!("Writing state to {path}..."); - let tenants = self.list_tenant_shards().await?; - let mut tenants_map = HashMap::new(); - for tsp in tenants { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - - tenants_map.insert(tenant_shard_id, tsp); - } - let json = serde_json::to_string(&JsonPersistence { - tenants: tenants_map, - })?; - - tokio::fs::write(path, &json).await?; - tracing::info!("Wrote {} bytes to {path}...", json.len()); - - Ok(()) + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) + .await } /// Tenants must be persisted before we schedule them for the first time. 
This enables us diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2765ff916e..fcfd4ea676 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -31,6 +31,7 @@ import backoff import httpx import jwt import psycopg2 +import psycopg2.sql import pytest import requests import toml @@ -727,8 +728,30 @@ class NeonEnvBuilder: self.repo_dir / "local_fs_remote_storage", ) - if (attachments_json := Path(repo_dir / "attachments.json")).exists(): - shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + # restore storage controller (the db is small, don't bother with overlayfs) + storcon_db_from_dir = repo_dir / "storage_controller_db" + storcon_db_to_dir = self.repo_dir / "storage_controller_db" + log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}") + assert storcon_db_from_dir.is_dir() + assert not storcon_db_to_dir.exists() + + def ignore_postgres_log(path: str, _names): + if Path(path) == storcon_db_from_dir: + return {"postgres.log"} + return set() + + shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) + assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. + # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller + # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # So, from_repo_dir patches up the the storcon database. + patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" + assert not patch_script_path.exists() + patch_script = "" + for ps in self.env.pageservers: + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines with (self.repo_dir / "config").open("r") as f: diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 60861cf939..949813c984 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -255,11 +255,3 @@ def run_pagebench_benchmark( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - env.storage_controller.allowed_errors.append( - # The test setup swaps NeonEnv instances, hence different - # pg instances are used for the storage controller db. This means - # the storage controller doesn't know about the nodes mentioned - # in attachments.json at start-up. - ".* Scheduler missing node 1", - ) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 1e5e320e0e..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -93,29 +93,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) -def fixup_storage_controller(env: NeonEnv): - """ - After importing a repo_dir, we need to massage the storage controller's state a bit: it will have - initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled - anywhere. 
- - After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get - the storage controller into a good state. - - This function should go away once compat tests carry the controller database in their snapshots, so - that the controller properly remembers nodes between creating + restoring the snapshot. - """ - env.storage_controller.allowed_errors.extend( - [ - ".*Tenant shard .+ references non-existent node.*", - ".*Failed to schedule tenant .+ at startup.*", - ] - ) - env.storage_controller.stop() - env.storage_controller.start() - env.storage_controller.reconcile_until_idle() - - @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot( @@ -198,7 +175,6 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") neon_env_builder.start() - fixup_storage_controller(env) check_neon_works( env, @@ -287,7 +263,6 @@ def test_forward_compatibility( assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) neon_env_builder.start() - fixup_storage_controller(env) # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) From de9bf2af6c37f2a48b7897ed5dd2b2c9d15aa419 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 10:14:56 +0100 Subject: [PATCH 270/412] tests: fix metrics check in test_s3_eviction (#8419) ## Problem This test would occasionally fail its metric check. This could happen in the rare case that the nodes had all been restarted before their most recent eviction. The metric check was added in https://github.com/neondatabase/neon/pull/8348 ## Summary of changes - Check metrics before each restart, accumulate into a bool that we assert on at the end of the test --- test_runner/regress/test_wal_acceptor.py | 43 +++++++++++++----------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2e906e6160..f02f19c588 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2242,6 +2242,8 @@ def test_s3_eviction( check_values = [0] * n_timelines + event_metrics_seen = False + n_iters = 20 for _ in range(n_iters): if log.isEnabledFor(logging.DEBUG): @@ -2266,6 +2268,27 @@ def test_s3_eviction( # update remote_consistent_lsn on pageserver ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + # Do metrics check before restarts, since these will reset to zero across a restart + event_metrics_seen |= any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) + # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: @@ -2280,22 +2303,4 @@ def test_s3_eviction( for sk in env.safekeepers ) - assert any( - sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "evict"} - ) - or 0 > 0 - and 
sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "evict"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "restore"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "restore"} - ) - or 0 > 0 - for sk in env.safekeepers - ) + assert event_metrics_seen From 27da0e9cf50247138c69d1b29515e66a7622b456 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 10:23:17 +0100 Subject: [PATCH 271/412] tests: increase test_pg_regress and test_isolation timeouts (#8418) ## Problem These tests time out ~1 in 50 runs when in debug mode. There is no indication of a real issue: they're just wrappers that have large numbers of individual tests contained within on pytest case. ## Summary of changes - Bump pg_regress timeout from 600 to 900s - Bump test_isolation timeout from 300s (default) to 600s In future it would be nice to break out these tests to run individual cases (or batches thereof) as separate tests, rather than this monolith. --- test_runner/regress/test_pg_regress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 54b493ec70..d5b5ac3f75 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -117,7 +117,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. # -@pytest.mark.timeout(600) +@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -186,6 +186,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, From 9868bb3346d27841e3cee67165745992270f65c9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 12:59:14 +0100 Subject: [PATCH 272/412] tests: turn on safekeeper eviction by default (#8352) ## Problem Ahead of enabling eviction in the field, where it will become the normal/default mode, let's enable it by default throughout our tests in case any issues become visible there. ## Summary of changes - Make default `extra_opts` for safekeepers enable offload & deletion - Set low timeouts in `extra_opts` so that tests running for tens of seconds have a chance to hit some of these background operations. --- test_runner/fixtures/neon_fixtures.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fcfd4ea676..567ca532f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4077,6 +4077,22 @@ class Safekeeper(LogUtils): self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + + if extra_opts is None: + # Testing defaults: enable everything, and set short timeouts so that background + # work will happen during short tests. + # **Note**: Any test that explicitly sets extra_opts will not get these defaults. 
+ extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "10s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident", + "10s", + ] + self.extra_opts = extra_opts def start( From 9f1ba2c4bfdb366c2710e0fc9629932ecde99d51 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Jul 2024 13:46:00 +0100 Subject: [PATCH 273/412] Fix partial upload bug with invalid remote state (#8383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have an issue that some partial uploaded segments can be actually missing in remote storage. I found this issue when was looking at the logs in staging, and it can be triggered by failed uploads: 1. Code tries to upload `SEG_TERM_LSN_LSN_sk5.partial`, but receives error from S3 2. The failed attempt is saved to `segments` vec 3. After some time, the code tries to upload `SEG_TERM_LSN_LSN_sk5.partial` again 4. This time the upload is successful and code calls `gc()` to delete previous uploads 5. Since new object and old object share the same name, uploaded data gets deleted from remote storage This commit fixes the issue by patching `gc()` not to delete objects with the same name as currently uploaded. --------- Co-authored-by: Arpad Müller --- safekeeper/src/timeline_eviction.rs | 5 +---- safekeeper/src/wal_backup_partial.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 0b8d58ee8a..7947d83eb4 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -199,10 +199,7 @@ async fn redownload_partial_segment( file.flush().await?; let final_path = local_segment_path(mgr, partial); - info!( - "downloaded {} bytes, renaming to {}", - final_path, final_path, - ); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 825851c97c..b1efa9749f 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -289,6 +289,18 @@ impl PartialBackup { }) .collect(); + if new_segments.len() == 1 { + // we have an uploaded segment, it must not be deleted from remote storage + segments_to_delete.retain(|name| name != &new_segments[0].name); + } else { + // there should always be zero or one uploaded segment + assert!( + new_segments.is_empty(), + "too many uploaded segments: {:?}", + new_segments + ); + } + info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { From f87b031876e042d46b3b3228e34ab1aab3159e28 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 18 Jul 2024 12:16:44 -0400 Subject: [PATCH 274/412] pageserver: integrate k-merge with bottom-most compaction (#8415) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the k-merge iterator in the compaction process to reduce memory footprint. 
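As a rough illustration of the approach (not the pageserver's actual `MergeIterator`, which streams asynchronously from on-disk delta and image layers), a k-way merge over already-sorted runs can be sketched with a min-heap; the names and the synchronous in-memory setup below are illustrative assumptions:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Merge several individually sorted runs into one sorted stream, buffering at most
/// one element per run at a time (a min-heap keyed on each run's current head).
fn k_merge<T: Ord>(runs: Vec<Vec<T>>) -> impl Iterator<Item = T> {
    let mut iters: Vec<_> = runs.into_iter().map(|r| r.into_iter()).collect();
    let mut heap = BinaryHeap::new();
    // Seed the heap with the head of each run; the run index tells us where to refill from.
    for (idx, it) in iters.iter_mut().enumerate() {
        if let Some(head) = it.next() {
            heap.push(Reverse((head, idx)));
        }
    }
    std::iter::from_fn(move || {
        // Pop the globally smallest head, then pull the next element from the same run.
        let Reverse((item, idx)) = heap.pop()?;
        if let Some(next) = iters[idx].next() {
            heap.push(Reverse((next, idx)));
        }
        Some(item)
    })
}
```

The point is that at most one element per input run is buffered at any time; in the actual change below, `MergeIterator::create(&delta_layers, &image_layers, ctx)` plays this role over layer iterators, so compaction no longer needs to collect an `all_key_values` vector up front.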
part of https://github.com/neondatabase/neon/issues/8002 ## Summary of changes * refactor the bottom-most compaction code to use k-merge iterator * add Send bound on some structs as it is used across the await points --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/disk_btree.rs | 4 +- pageserver/src/tenant/storage_layer.rs | 2 - .../src/tenant/storage_layer/delta_layer.rs | 21 +++--- .../src/tenant/storage_layer/image_layer.rs | 23 +++--- pageserver/src/tenant/storage_layer/layer.rs | 5 +- .../tenant/storage_layer/merge_iterator.rs | 4 ++ pageserver/src/tenant/timeline/compaction.rs | 70 ++++++++----------- pageserver/src/tenant/vectored_blob_io.rs | 2 - 9 files changed, 62 insertions(+), 71 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dc6f42eaeb..637051413f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6810,7 +6810,7 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..get_key(10), + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 251d2ab4ad..1583a3826a 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -262,7 +262,7 @@ where pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a> where - R: 'a, + R: 'a + Send, { DiskBtreeIterator { stream: Box::pin(self.into_stream(start_key, ctx)), @@ -521,7 +521,7 @@ where pub struct DiskBtreeIterator<'a> { #[allow(clippy::type_complexity)] stream: std::pin::Pin< - Box, u64), DiskBtreeError>> + 'a>, + Box, u64), DiskBtreeError>> + 'a + Send>, >, } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2f0c45317d..a389358f0d 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -6,8 +6,6 @@ pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; - -#[cfg(test)] pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 43941b6e17..c34923320a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; 
use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -747,12 +751,10 @@ impl DeltaLayer { } impl DeltaLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.layer_key_range } - #[cfg(test)] pub(crate) fn lsn_range(&self) -> &Range { &self.layer_lsn_range } @@ -1512,7 +1514,6 @@ impl DeltaLayerInner { offset } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -1523,7 +1524,7 @@ impl DeltaLayerInner { index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), key_values_batch: std::collections::VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -1595,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } } -#[cfg(test)] pub struct DeltaLayerIterator<'a> { delta_layer: &'a DeltaLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> DeltaLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. 
async fn next_batch(&mut self) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a88a1e6429..c7f41b66be 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -369,12 +373,10 @@ impl ImageLayer { } impl ImageLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.key_range } - #[cfg(test)] pub(crate) fn lsn(&self) -> Lsn { self.lsn } @@ -699,7 +701,6 @@ impl ImageLayerInner { } } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -708,9 +709,9 @@ impl ImageLayerInner { image_layer: self, ctx, index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx), - key_values_batch: std::collections::VecDeque::new(), + key_values_batch: VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -974,17 +975,15 @@ impl Drop for ImageLayerWriter { } } -#[cfg(test)] pub struct ImageLayerIterator<'a> { image_layer: &'a ImageLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> ImageLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index dbf6c60aae..d9cbaba529 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -385,6 +385,7 @@ impl Layer { } /// Get all key/values in the layer. 
Should be replaced with an iterator-based API in the future. + #[allow(dead_code)] pub(crate) async fn load_key_values( &self, ctx: &RequestContext, @@ -1918,7 +1919,7 @@ impl ResidentLayer { self.owner.metadata() } - #[cfg(test)] + /// Cast the layer to a delta, return an error if it is an image layer. pub(crate) async fn get_as_delta( &self, ctx: &RequestContext, @@ -1930,7 +1931,7 @@ impl ResidentLayer { } } - #[cfg(test)] + /// Cast the layer to an image, return an error if it is a delta layer. pub(crate) async fn get_as_image( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 0edfd4bd40..6f59b2fd77 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -547,5 +547,9 @@ mod tests { &ctx, ); assert_merge_iter_equal(&mut merge_iter, &expect).await; + + is_send(merge_iter); } + + fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index f251b667c2..a648432b4d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -27,6 +27,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; @@ -1039,10 +1040,12 @@ impl Timeline { ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. - let mut all_key_values = Vec::new(); + let mut downloaded_layers = Vec::new(); let mut delta_split_points = BTreeSet::new(); for layer in &layer_selection { - all_key_values.extend(layer.load_key_values(ctx).await?); + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + let desc = layer.layer_desc(); if desc.is_delta() { // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) @@ -1052,44 +1055,28 @@ impl Timeline { delta_split_points.insert(key_range.end); } } - // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and - // image layers, make image appear before than delta. 
- struct ValueWrapper<'a>(&'a crate::repository::Value); - impl Ord for ValueWrapper<'_> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - use crate::repository::Value; - use std::cmp::Ordering; - match (self.0, other.0) { - (Value::Image(_), Value::WalRecord(_)) => Ordering::Less, - (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater, - _ => Ordering::Equal, - } + let mut delta_layers = Vec::new(); + let mut image_layers = Vec::new(); + for resident_layer in &downloaded_layers { + if resident_layer.layer_desc().is_delta() { + let layer = resident_layer.get_as_delta(ctx).await?; + delta_layers.push(layer); + } else { + let layer = resident_layer.get_as_image(ctx).await?; + image_layers.push(layer); } } - impl PartialOrd for ValueWrapper<'_> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } - } - impl PartialEq for ValueWrapper<'_> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == std::cmp::Ordering::Equal - } - } - impl Eq for ValueWrapper<'_> {} - all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { - (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) - }); + let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx); // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. // Data of the same key. let mut accumulated_values = Vec::new(); - let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + let mut last_key: Option = None; /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. async fn flush_accumulated_states( tline: &Arc, key: Key, - accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + accumulated_values: &[(Key, Lsn, crate::repository::Value)], horizon: Lsn, ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { let mut base_image = None; @@ -1190,7 +1177,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + &(Key::MIN..Key::MAX), // covers the full key range gc_cutoff, ctx, ) @@ -1200,20 +1187,24 @@ impl Timeline { let delta_split_points = delta_split_points.into_iter().collect_vec(); let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); - for item @ (key, _, _) in &all_key_values { - if &last_key == key { - accumulated_values.push(item); + while let Some((key, lsn, val)) = merge_iter.next().await? { + if last_key.is_none() || last_key.as_ref() == Some(&key) { + if last_key.is_none() { + last_key = Some(key); + } + accumulated_values.push((key, lsn, val)); } else { + let last_key = last_key.as_mut().unwrap(); let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- image_layer_writer.put_image(last_key, image, ctx).await?; + image_layer_writer.put_image(*last_key, image, ctx).await?; delta_values.extend(deltas); delta_layers.extend( flush_deltas( &mut delta_values, - last_key, + *last_key, &delta_split_points, &mut current_delta_split_point, self, @@ -1223,11 +1214,12 @@ impl Timeline { .await?, ); accumulated_values.clear(); - accumulated_values.push(item); - last_key = *key; + *last_key = key; + accumulated_values.push((key, lsn, val)); } } + let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body let (deltas, image) = flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 5a0986ea12..54a3ad789b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -396,7 +396,6 @@ impl<'a> VectoredBlobReader<'a> { /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for /// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. -#[cfg(test)] pub struct StreamingVectoredReadPlanner { read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] @@ -410,7 +409,6 @@ pub struct StreamingVectoredReadPlanner { cnt: usize, } -#[cfg(test)] impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); From 1b508a6082adee5506c44b3bbd05f2e7f67c400d Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 18 Jul 2024 18:18:18 +0200 Subject: [PATCH 275/412] Temporarily use vanilla pgbench and psql (client) for running pgvector benchmark (#8422) ## Problem https://github.com/neondatabase/neon/issues/8275 is not yet fixed Periodic benchmarking fails with SIGABRT in pgvector step, see https://github.com/neondatabase/neon/actions/runs/9967453263/job/27541159738#step:7:393 ## Summary of changes Instead of using pgbench and psql from Neon artifacts, download vanilla postgres binaries into the container and use those to run the client side of the test. 
--- .github/workflows/benchmarking.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d785156a29..833a4ce33c 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -457,17 +457,21 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - options: --init + options: --init --user root steps: - uses: actions/checkout@v4 - - name: Download Neon artifact - uses: ./.github/actions/download - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact - path: /tmp/neon/ - prefix: latest + # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 + # instead of using Neon artifacts containing pgbench + - name: Install postgresql-16 where pytest expects it + run: | + apt-get update && apt-get install -y postgresql-common + /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + apt-get -y install postgresql-16 + mkdir -p /tmp/neon/pg_install/v16/bin + ln -s /usr/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /usr/bin/psql /tmp/neon/pg_install/v16/bin/psql - name: Set up Connection String id: set-up-connstr From fceace835b318868c072af3c5cdaa6ddd152bb44 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Jul 2024 17:26:27 +0100 Subject: [PATCH 276/412] Change log level for GuardDrop error (#8305) The error means that manager exited earlier than `ResidenceGuard` and it's not unexpected with current deletion implementation. This commit changes log level to reduse noise. --- safekeeper/src/timeline_guard.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index e249c859b4..dbdf46412d 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; -use tracing::{debug, warn}; +use tracing::debug; use crate::timeline_manager::ManagerCtlMessage; @@ -23,7 +23,7 @@ impl Drop for ResidenceGuard { .manager_tx .send(ManagerCtlMessage::GuardDrop(self.guard_id)); if let Err(e) = res { - warn!("failed to send GuardDrop message: {:?}", e); + debug!("failed to send GuardDrop message: {:?}", e); } } } From ed7ee73cbae80f2c5e6cffb7dff278409c9a8bc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 18 Jul 2024 20:09:57 +0200 Subject: [PATCH 277/412] Enable zstd in tests (#8368) Successor of #8288 , just enable zstd in tests. Also adds a test that creates easily compressable data. 
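For intuition on the ratio the new test asserts on (it generates rows of repeated short strings and expects the compressed/uncompressed byte ratio to stay well under 0.2), here is a small standalone sketch; it is an assumption for illustration only, using the `zstd` crate directly rather than the pageserver's image-layer write path:

```rust
// Standalone sketch, not the pageserver's write path: compress one ~4 KB repetitive
// row (similar to the rows the new test inserts) with the `zstd` crate, which is an
// assumed stand-in for the layer writer's zstd support, and print the resulting ratio.
fn main() -> std::io::Result<()> {
    let row = "abcde042".repeat(500);
    let compressed = zstd::encode_all(row.as_bytes(), 0)?;
    println!(
        "in={} out={} ratio={:.3}",
        row.len(),
        compressed.len(),
        compressed.len() as f64 / row.len() as f64
    );
    Ok(())
}
```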
Part of #5431 --------- Co-authored-by: John Spray Co-authored-by: Joonas Koivunen --- pageserver/src/metrics.rs | 16 ++++ .../src/tenant/storage_layer/image_layer.rs | 10 ++ test_runner/fixtures/neon_fixtures.py | 1 + test_runner/regress/test_compaction.py | 93 ++++++++++++++++++- .../regress/test_disk_usage_eviction.py | 3 + 5 files changed, 122 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 753f5524c5..c03567f6ef 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -610,6 +610,22 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_total", + "Size of uncompressed data written into image layers" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_out_bytes_total", + "Size of compressed image layer written" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c7f41b66be..45b47bb62b 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -738,6 +738,9 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, + // Total uncompressed bytes passed into put_image + uncompressed_bytes: u64, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -793,6 +796,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + uncompressed_bytes: 0, }; Ok(writer) @@ -811,6 +815,7 @@ impl ImageLayerWriterInner { ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let compression = self.conf.image_compression; + self.uncompressed_bytes += img.len() as u64; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) @@ -836,6 +841,11 @@ impl ImageLayerWriterInner { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + // Calculate compression ratio + let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + let mut file = self.blob_writer.into_inner(); // Write out the index diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 567ca532f9..db7269ad41 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1158,6 +1158,7 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + "image_compression": "zstd", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f321c09b27..be787e0642 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -6,7 +6,10 @@ from typing import Optional import pytest from 
fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + generate_uploads_and_deletions, +) from fixtures.pageserver.http import PageserverApiException from fixtures.utils import wait_until from fixtures.workload import Workload @@ -142,6 +145,10 @@ def test_sharding_compaction( "image_layer_creation_check_threshold": 0, } + # Disable compression, as we can't estimate the size of layers with compression enabled + # TODO: implement eager layer cutting during compaction + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, @@ -320,3 +327,87 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) or 0 ) == 0 assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") + + +@pytest.mark.parametrize("enabled", [True, False]) +def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool): + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers as eagerly as possible + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + # Explicitly enable/disable compression, rather than using default + if enabled: + neon_env_builder.pageserver_config_override = "image_compression='zstd'" + else: + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver = env.pageserver + ps_http = env.pageserver.http_client() + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate around 800k worth of easily compressible data to store + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))" + ) + # run compaction to create image layers + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + + layer_map = ps_http.layer_map_info(tenant_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + assert image_layer_count > 0 + assert delta_layer_count > 0 + + log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}") + + bytes_in = pageserver.http_client().get_metric_value( + "pageserver_compression_image_in_bytes_total" + ) + bytes_out = pageserver.http_client().get_metric_value( + "pageserver_compression_image_out_bytes_total" + ) + assert bytes_in is not None + assert bytes_out is not None + log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + + if enabled: + # We are writing high compressible repetitive plain text, 
expect excellent compression + EXPECT_RATIO = 0.2 + assert bytes_out / bytes_in < EXPECT_RATIO + else: + # Nothing should be compressed if we disabled it. + assert bytes_out >= bytes_in + + # Destroy the endpoint and create a new one to resetthe caches + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + for v in range(100): + res = endpoint.safe_psql( + f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" + ) + assert res[0][0] == 1 diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index fb8b7b22fa..3c834f430b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -230,6 +230,9 @@ def _eviction_env( neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + # Disable compression support for EvictionEnv to get larger layer sizes + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() From b98b301d56d2abe67f5637fed3e62e368345f4d9 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 19 Jul 2024 15:40:55 +0200 Subject: [PATCH 278/412] Bodobolero/fix root permissions (#8429) ## Problem My prior PR https://github.com/neondatabase/neon/pull/8422 caused leftovers in the GitHub action runner work directory with root permission. As an example see here https://github.com/neondatabase/neon/actions/runs/10001857641/job/27646237324#step:3:37 To work-around we install vanilla postgres as non-root using deb packages in /home/nonroot user directory ## Summary of changes - since we cannot use root we install the deb pkgs directly and create symbolic links for psql, pgbench and libs in expected places - continue jobs an aws even if azure jobs fail (because this region is currently unreliable) --- .github/workflows/benchmarking.yml | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 833a4ce33c..c132b5b513 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -57,6 +57,7 @@ jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} strategy: + fail-fast: false matrix: include: - DEFAULT_PG_VERSION: 16 @@ -439,6 +440,7 @@ jobs: pgbench-pgvector: strategy: + fail-fast: false matrix: include: - PLATFORM: "neon-captest-pgvector" @@ -451,13 +453,14 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - options: --init --user root + options: --init steps: - uses: actions/checkout@v4 @@ -466,12 +469,19 @@ jobs: # instead of using Neon artifacts containing pgbench - name: Install postgresql-16 where pytest expects it run: | - apt-get update && apt-get install -y postgresql-common - /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y - apt-get -y install postgresql-16 + cd /home/nonroot + wget -q 
https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg mkdir -p /tmp/neon/pg_install/v16/bin - ln -s /usr/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /usr/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + /tmp/neon/pg_install/v16/bin/pgbench --version + /tmp/neon/pg_install/v16/bin/psql --version - name: Set up Connection String id: set-up-connstr @@ -532,7 +542,6 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. From 9b883e46516fab4191573a4494562fc764492cff Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 19 Jul 2024 18:01:02 +0200 Subject: [PATCH 279/412] pageserver: remove obsolete cached_metric_collection_interval (#8370) We're removing the usage of this long-meaningless config field in https://github.com/neondatabase/aws/pull/1599 Once that PR has been deployed to staging and prod, we can merge this PR. --- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 24 ------------------- pageserver/src/consumption_metrics.rs | 7 ------ .../test_pageserver_metric_collection.py | 2 -- 4 files changed, 34 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9f705f0bc9..fceddfb757 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -622,7 +622,6 @@ fn start_pageserver( metric_collection_endpoint, &conf.metric_collection_bucket, conf.metric_collection_interval, - conf.cached_metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, local_disk_storage, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5b103b551f..35b4e79365 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -68,7 +68,6 @@ pub mod defaults { super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; @@ -123,7 +122,6 @@ pub mod defaults { #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -238,7 +236,6 @@ pub struct PageServerConf { // How often to collect metrics and send them to the metrics endpoint. 
pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. - pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, @@ -370,7 +367,6 @@ struct PageServerConfigBuilder { concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, - cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, metric_collection_bucket: BuilderValue>, @@ -454,10 +450,6 @@ impl PageServerConfigBuilder { DEFAULT_METRIC_COLLECTION_INTERVAL, ) .expect("cannot parse default metric collection interval")), - cached_metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default cached_metric_collection_interval")), synthetic_size_calculation_interval: Set(humantime::parse_duration( DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, ) @@ -589,14 +581,6 @@ impl PageServerConfigBuilder { self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) } - pub fn cached_metric_collection_interval( - &mut self, - cached_metric_collection_interval: Duration, - ) { - self.cached_metric_collection_interval = - BuilderValue::Set(cached_metric_collection_interval) - } - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } @@ -730,7 +714,6 @@ impl PageServerConfigBuilder { broker_keepalive_interval, log_format, metric_collection_interval, - cached_metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, @@ -947,7 +930,6 @@ impl PageServerConf { NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
}), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), "metric_collection_endpoint" => { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); @@ -1080,7 +1062,6 @@ impl PageServerConf { eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), metric_collection_interval: Duration::from_secs(60), - cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), @@ -1259,7 +1240,6 @@ initial_superuser_name = 'zzzz' id = 10 metric_collection_interval = '222 s' -cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' @@ -1315,9 +1295,6 @@ background_task_maximum_delay = '334 s' metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, - cached_metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL - )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( @@ -1396,7 +1373,6 @@ background_task_maximum_delay = '334 s' eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), metric_collection_interval: Duration::from_secs(222), - cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 18c1a6cd9b..6861adad2c 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -46,19 +46,12 @@ pub async fn collect_metrics( metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - if _cached_metric_collection_interval != Duration::ZERO { - tracing::warn!( - "cached_metric_collection_interval is no longer used, please set it to zero." 
- ) - } - // spin up background worker that caclulates tenant sizes let worker_ctx = ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index cea35a6acb..24a37b04ec 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -58,7 +58,6 @@ def test_metric_collection( metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ @@ -216,7 +215,6 @@ def test_metric_collection_cleans_up_tempfile( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ From affe4084334a6700a858ba3e8c3a63e65ea85249 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 19 Jul 2024 17:07:59 +0100 Subject: [PATCH 280/412] storage scrubber: GC ancestor shard layers (#8196) ## Problem After a shard split, the pageserver leaves the ancestor shard's content in place. It may be referenced by child shards, but eventually child shards will de-reference most ancestor layers as they write their own data and do GC. We would like to eventually clean up those ancestor layers to reclaim space. ## Summary of changes - Extend the physical GC command with `--mode=full`, which includes cleaning up unreferenced ancestor shard layers - Add test `test_scrubber_physical_gc_ancestors` - Remove colored log output: in testing this is irritating ANSI code spam in logs, and in interactive use doesn't add much. - Refactor storage controller API client code out of storcon_client into a `storage_controller/client` crate - During physical GC of ancestors, call into the storage controller to check that the latest shards seen in S3 reflect the latest state of the tenant, and there is no shard split in progress. 
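As a sketch of how the refactored client crate can be used (hypothetical caller code, not the scrubber's actual implementation; the endpoint path and error handling are assumptions), a consumer such as the scrubber might confirm the controller's view of a tenant's shards before trusting what it found in S3:

```rust
use pageserver_api::controller_api::TenantDescribeResponse;
use reqwest::{Method, Url};
use storage_controller_client::control_api::Client;
use utils::id::TenantId;

/// Ask the storage controller how many shards a tenant currently has, so physical GC
/// can cross-check the shard layout it inferred from S3 listings. The
/// `control/v1/tenant/{id}` path is an assumption for illustration.
async fn controller_shard_count(
    api: Url,
    jwt: Option<String>,
    tenant_id: TenantId,
) -> anyhow::Result<usize> {
    let client = Client::new(api, jwt);
    let describe: TenantDescribeResponse = client
        .dispatch(
            Method::GET,
            format!("control/v1/tenant/{tenant_id}"),
            None::<()>,
        )
        .await?;
    Ok(describe.shards.len())
}
```

If the controller reports a different shard layout than the S3 listing implies (for example because a split is still in progress), the safe choice is to skip ancestor GC for that tenant, which matches the intent described above.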
--- Cargo.lock | 41 +- Cargo.toml | 4 +- control_plane/storcon_cli/Cargo.toml | 1 + control_plane/storcon_cli/src/main.rs | 62 +-- libs/pageserver_api/src/controller_api.rs | 4 +- libs/utils/src/auth.rs | 4 + pageserver/src/auth.rs | 16 +- safekeeper/src/auth.rs | 16 +- storage_controller/client/Cargo.toml | 23 + storage_controller/client/src/control_api.rs | 62 +++ storage_controller/client/src/lib.rs | 1 + storage_controller/src/http.rs | 2 +- storage_controller/src/main.rs | 18 +- storage_controller/src/service.rs | 2 + storage_scrubber/Cargo.toml | 1 + storage_scrubber/src/lib.rs | 33 +- storage_scrubber/src/main.rs | 46 +- .../src/pageserver_physical_gc.rs | 481 +++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 24 +- .../regress/test_pageserver_generations.py | 3 +- .../regress/test_pageserver_secondary.py | 6 +- test_runner/regress/test_sharding.py | 3 +- test_runner/regress/test_storage_scrubber.py | 237 ++++++++- test_runner/regress/test_tenant_delete.py | 6 +- 24 files changed, 905 insertions(+), 191 deletions(-) create mode 100644 storage_controller/client/Cargo.toml create mode 100644 storage_controller/client/src/control_api.rs create mode 100644 storage_controller/client/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index d08da0babd..2505d4d3ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3234,16 +3234,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num" version = "0.4.1" @@ -3539,12 +3529,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.11.1" @@ -5822,6 +5806,28 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "pageserver_api", + "pageserver_client", + "postgres", + "reqwest 0.12.4", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + [[package]] name = "storage_scrubber" version = "0.1.0" @@ -5856,6 +5862,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "storage_controller_client", "thiserror", "tokio", "tokio-postgres", @@ -5885,6 +5892,7 @@ dependencies = [ "reqwest 0.12.4", "serde", "serde_json", + "storage_controller_client", "thiserror", "tokio", "tracing", @@ -6611,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - "nu-ansi-term", "once_cell", "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index b9b4bafb4f..615f5472ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ members = [ "safekeeper", "storage_broker", "storage_controller", + "storage_controller/client", "storage_scrubber", "workspace_hack", "libs/compute_api", @@ -182,7 +183,7 @@ tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" -tracing-subscriber = { version = "0.3", default-features = 
false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" @@ -221,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. +storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index f96f0084b2..be69208d0d 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -17,6 +17,7 @@ pageserver_client.workspace = true reqwest.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +storage_controller_client.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 777a717a73..5c1add070a 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -14,15 +14,15 @@ use pageserver_api::{ }, shard::{ShardStripeSize, TenantShardId}, }; -use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use storage_controller_client::control_api::Client; #[derive(Subcommand, Debug)] enum Command { @@ -249,64 +249,6 @@ impl FromStr for NodeAvailabilityArg { } } -struct Client { - base_url: Url, - jwt_token: Option, - client: reqwest::Client, -} - -impl Client { - fn new(base_url: Url, jwt_token: Option) -> Self { - Self { - base_url, - jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - /// Simple HTTP request wrapper for calling into storage controller - async fn dispatch( - &self, - method: Method, - path: String, - body: Option, - ) -> mgmt_api::Result - where - RQ: Serialize + Sized, - RS: DeserializeOwned + Sized, - { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. 
- let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; - let response = response.error_from_body().await?; - - response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f05c1315ea..d0e1eb6b28 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponse { pub tenant_id: TenantId, pub shards: Vec, @@ -110,7 +110,7 @@ pub struct NodeDescribeResponse { pub listen_pg_port: u16, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 03e65f74fe..a1170a460d 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -33,6 +33,10 @@ pub enum Scope { GenerationsApi, // Allows access to control plane managment API and some storage controller endpoints. Admin, + + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state + /// of a tenant & post scrub results. + Scrubber, } /// JWT payload. 
See docs/authentication.md for the format diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4785c8c4c5..9e3dedb75a 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )) + } } } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index dd9058c468..b8bc3f3e06 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), + )) + } (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml new file mode 100644 index 0000000000..c3bfe2bfd2 --- /dev/null +++ b/storage_controller/client/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storage_controller_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +pageserver_client.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs new file mode 100644 index 0000000000..a981b5020e --- /dev/null +++ b/storage_controller/client/src/control_api.rs @@ -0,0 +1,62 @@ +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use std::str::FromStr; + +pub struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + pub async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} diff --git a/storage_controller/client/src/lib.rs b/storage_controller/client/src/lib.rs new file mode 100644 index 0000000000..6d5e202942 --- /dev/null +++ b/storage_controller/client/src/lib.rs @@ -0,0 +1 @@ +pub mod control_api; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 9ddf98eb3b..8fb4be93e0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -430,7 +430,7 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 4bf6b528f4..789f96beb3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,7 @@ use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -310,12 +311,21 @@ async fn async_main() -> anyhow::Result<()> { tracing::info!("Terminating on signal"); // Stop HTTP server first, so that we don't have to service requests - // while shutting down Service + // while shutting down Service. server_shutdown.cancel(); - if let Err(e) = server_task.await { - tracing::error!("Error joining HTTP server task: {e}") + match tokio::time::timeout(Duration::from_secs(5), server_task).await { + Ok(Ok(_)) => { + tracing::info!("Joined HTTP server task"); + } + Ok(Err(e)) => { + tracing::error!("Error joining HTTP server task: {e}") + } + Err(_) => { + tracing::warn!("Timed out joining HTTP server task"); + // We will fall through and shut down the service anyway, any request handlers + // in flight will experience cancellation & their clients will see a torn connection. 
+ } } - tracing::info!("Joined HTTP server task"); service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3c24433c42..a163453dca 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3956,6 +3956,8 @@ impl Service { "failpoint".to_string() ))); + failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + tracing::info!( "Split {} into {}", parent_id, diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 050be66483..5233afbebe 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -34,6 +34,7 @@ camino.workspace = true rustls.workspace = true rustls-native-certs.workspace = true once_cell.workspace = true +storage_controller_client.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 9102ad9906..a0b6d7ea30 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -24,6 +24,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; +use remote_storage::RemotePath; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; @@ -31,7 +32,7 @@ use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; -use utils::id::{TenantId, TimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -54,7 +55,7 @@ pub struct S3Target { /// in the pageserver, as all timeline objects existing in the scope of a particular /// tenant: the scrubber is different in that it handles collections of data referring to many /// TenantShardTimelineIds in on place. -#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct TenantShardTimelineId { tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -67,6 +68,10 @@ impl TenantShardTimelineId { timeline_id, } } + + fn as_tenant_timeline_id(&self) -> TenantTimelineId { + TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id) + } } impl Display for TenantShardTimelineId { @@ -179,6 +184,22 @@ impl RootTarget { .with_sub_segment(&id.timeline_id.to_string()) } + /// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal + /// key in the S3 bucket. + pub fn absolute_key(&self, key: &RemotePath) -> String { + let root = match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + }; + + let prefix = &root.prefix_in_bucket; + if prefix.ends_with('/') { + format!("{prefix}{key}") + } else { + format!("{prefix}/{key}") + } + } + pub fn bucket_name(&self) -> &str { match self { Self::Pageserver(root) => &root.bucket_name, @@ -216,6 +237,14 @@ impl BucketConfig { } } +pub struct ControllerClientConfig { + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + pub controller_api: Url, + + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. 
+ pub controller_jwt: String, +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index d816121192..b3ed6f6451 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,11 +1,12 @@ -use anyhow::bail; +use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use storage_scrubber::find_large_objects; +use reqwest::Url; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, @@ -24,6 +25,14 @@ struct Cli { #[arg(short, long, default_value_t = false)] delete: bool, + + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + controller_api: Option, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + controller_jwt: Option, } #[derive(Subcommand, Debug)] @@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let summary = - pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + + match (&controller_client_conf, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run")); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. 
+ } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client_conf, + tenant_ids, + min_age.into(), + mode, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index fb8fbc1635..e977fd49f7 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,22 +1,50 @@ -use std::time::{Duration, UNIX_EPOCH}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::{ + init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use pageserver_api::shard::TenantShardId; +use pageserver_api::controller_api::TenantDescribeResponse; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::RemotePath; +use reqwest::Method; use serde::Serialize; +use storage_controller_client::control_api; use tracing::{info_span, Instrument}; use utils::generation::Generation; +use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, remote_storage_errors: usize, + controller_api_errors: usize, + ancestor_layers_deleted: usize, +} + +impl GcSummary { + fn merge(&mut self, other: Self) { + let Self { + indices_deleted, + remote_storage_errors, + ancestor_layers_deleted, + controller_api_errors, + } = other; + + self.indices_deleted += indices_deleted; + self.remote_storage_errors += remote_storage_errors; + self.ancestor_layers_deleted += ancestor_layers_deleted; + self.controller_api_errors += controller_api_errors; + } } #[derive(clap::ValueEnum, Debug, Clone, Copy)] @@ -26,9 +54,9 @@ pub enum GcMode { // Enable only removing old-generation indices IndicesOnly, + // Enable all forms of GC - // TODO: this will be used when shard split ancestor layer deletion is added - // All, + Full, } impl std::fmt::Display for GcMode { @@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode { match self { GcMode::DryRun => write!(f, "dry-run"), GcMode::IndicesOnly => write!(f, "indices-only"), + GcMode::Full => write!(f, "full"), } } } +mod refs { + use super::*; + // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other + // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that + // do have cross shard refs should eventually drop most of them via compaction. + // + // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor + // which is is referenced_. 
+ #[derive(Default)] + pub(super) struct AncestorRefs( + BTreeMap>, + ); + + impl AncestorRefs { + /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline. + pub(super) fn update( + &mut self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) { + let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default(); + for (layer_name, layer_metadata) in layers { + // Increment refcount of this layer in the ancestor shard + *(ttid_refs + .entry((layer_metadata.shard, layer_name)) + .or_default()) += 1; + } + } + + /// For a particular TTID, return the map of all ancestor layers referenced by a descendent to their refcount + /// + /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendent. + pub(super) fn get_ttid_refcounts( + &self, + ttid: &TenantTimelineId, + ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> { + self.0.get(ttid) + } + } +} + +use refs::AncestorRefs; + +// As we see shards for a tenant, acccumulate knowledge needed for cross-shard GC: +// - Are there any ancestor shards? +// - Are there any refs to ancestor shards' layers? +#[derive(Default)] +struct TenantRefAccumulator { + shards_seen: HashMap>, + + // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to + ancestor_ref_shards: AncestorRefs, +} + +impl TenantRefAccumulator { + fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { + let this_shard_idx = ttid.tenant_shard_id.to_index(); + (*self + .shards_seen + .entry(ttid.tenant_shard_id.tenant_id) + .or_default()) + .push(this_shard_idx); + + let mut ancestor_refs = Vec::new(); + for (layer_name, layer_metadata) in &index_part.layer_metadata { + if layer_metadata.shard != this_shard_idx { + // This is a reference from this shard to a layer in an ancestor shard: we must track this + // as a marker to not GC this layer from the parent. + ancestor_refs.push((layer_name.clone(), layer_metadata.clone())); + } + } + + if !ancestor_refs.is_empty() { + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); + } + } + + /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve + async fn into_gc_ancestors( + self, + controller_client: &control_api::Client, + summary: &mut GcSummary, + ) -> (Vec, AncestorRefs) { + let mut ancestors_to_gc = Vec::new(); + for (tenant_id, mut shard_indices) in self.shards_seen { + // Find the highest shard count + let latest_count = shard_indices + .iter() + .map(|i| i.shard_count) + .max() + .expect("Always at least one shard"); + + let (mut latest_shards, ancestor_shards) = { + let at = + itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); + (shard_indices[0..at].to_owned(), &shard_indices[at..]) + }; + // Sort shards, as we will later compare them with a sorted list from the controller + latest_shards.sort(); + + // Check that we have a complete view of the latest shard count: this should always be the case unless we happened + // to scan the S3 bucket halfway through a shard split. + if latest_shards.len() != latest_count.count() as usize { + // This should be extremely rare, so we warn on it. 
+ tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + continue; + } + + // Check if we have any non-latest-count shards + if ancestor_shards.is_empty() { + tracing::debug!(%tenant_id, "No ancestor shards to clean up"); + continue; + } + + // Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We + // must only do this work if the tenant is not currently being split: otherwise, it is not safe + // to GC ancestors, because if the split fails then the controller will try to attach ancestor + // shards again. + match controller_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await + { + Err(e) => { + // We were not able to learn the latest shard split state from the controller, so we will not + // do ancestor GC on this tenant. + tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}"); + summary.controller_api_errors += 1; + continue; + } + Ok(desc) => { + // We expect to see that the latest shard count matches the one we saw in S3, and that none + // of the shards indicate splitting in progress. + + let controller_indices: Vec = desc + .shards + .iter() + .map(|s| s.tenant_shard_id.to_index()) + .collect(); + if controller_indices != latest_shards { + tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); + continue; + } + + if desc.shards.iter().any(|s| s.is_splitting) { + tracing::info!(%tenant_id, "One or more shards is currently splitting"); + continue; + } + + // This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs. + tracing::info!(%tenant_id, "Validated state with controller: {desc:?}"); + } + } + + // GC ancestor shards + for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId { + tenant_id, + shard_count: idx.shard_count, + shard_number: idx.shard_number, + }) { + ancestors_to_gc.push(ancestor_shard); + } + } + + (ancestors_to_gc, self.ancestor_ref_shards) + } +} + +async fn is_old_enough( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + key: &str, + summary: &mut GcSummary, +) -> bool { + // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident + // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. 
+ let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return false; + } + Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) { + Ok(Ok(e)) => e, + Err(_) | Ok(Err(_)) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return false; + } + }, + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return false; + } + }; + let old_enough = &age > min_age; + + if !old_enough { + tracing::info!( + "Skipping young object {} < {}", + humantime::format_duration(age), + humantime::format_duration(*min_age) + ); + } + + old_enough +} + async fn maybe_delete_index( s3_client: &Client, bucket_config: &BucketConfig, @@ -79,45 +329,7 @@ async fn maybe_delete_index( return; } - // Validation: we will only delete indices after one week, so that during incidents we will have - // easy access to recent indices. - let age: Duration = match s3_client - .head_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() - .await - { - Ok(response) => match response.last_modified { - None => { - tracing::warn!("Missing last_modified"); - summary.remote_storage_errors += 1; - return; - } - Some(last_modified) => { - let last_modified = - UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); - match last_modified.elapsed() { - Ok(e) => e, - Err(_) => { - tracing::warn!("Bad last_modified time: {last_modified:?}"); - return; - } - } - } - }, - Err(e) => { - tracing::warn!("Failed to HEAD {key}: {e}"); - summary.remote_storage_errors += 1; - return; - } - }; - if &age < min_age { - tracing::info!( - "Skipping young object {} < {}", - age.as_secs_f64(), - min_age.as_secs_f64() - ); + if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await { return; } @@ -145,6 +357,108 @@ async fn maybe_delete_index( } } +#[allow(clippy::too_many_arguments)] +async fn gc_ancestor( + s3_client: &Client, + bucket_config: &BucketConfig, + root_target: &RootTarget, + min_age: &Duration, + ancestor: TenantShardId, + refs: &AncestorRefs, + mode: GcMode, + summary: &mut GcSummary, +) -> anyhow::Result<()> { + // Scan timelines in the ancestor + let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?; + let mut timelines = std::pin::pin!(timelines); + + // Build a list of keys to retain + + while let Some(ttid) = timelines.next().await { + let ttid = ttid?; + + let data = list_timeline_blobs(s3_client, ttid, root_target).await?; + + let s3_layers = match data.blob_data { + BlobDataParseResult::Parsed { + index_part: _, + index_part_generation: _, + s3_layers, + } => s3_layers, + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
+ continue; + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!( + "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + ); + continue; + } + }; + + let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id()); + let ancestor_shard_index = ttid.tenant_shard_id.to_index(); + + for (layer_name, layer_gen) in s3_layers { + let ref_count = ttid_refs + .and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone()))) + .copied() + .unwrap_or(0); + + if ref_count > 0 { + tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs"); + continue; + } + + tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced"); + + // Build the key for the layer we are considering deleting + let key = root_target.absolute_key(&remote_layer_path( + &ttid.tenant_shard_id.tenant_id, + &ttid.timeline_id, + ancestor_shard_index, + &layer_name, + layer_gen, + )); + + // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability + // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away + if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await { + continue; + } + + if !matches!(mode, GcMode::Full) { + tracing::info!("Dry run: would delete key {key}"); + continue; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(&key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted unreferenced ancestor layer {key}"); + summary.ancestor_layers_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete layer {key}: {e}"); + summary.remote_storage_errors += 1; + } + } + } + + // TODO: if all the layers are gone, clean up the whole timeline dir (remove index) + } + + Ok(()) +} + /// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection /// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection /// is about removing: @@ -156,22 +470,26 @@ async fn maybe_delete_index( /// make sure that object listings don't get slowed down by large numbers of garbage objects. pub async fn pageserver_physical_gc( bucket_config: BucketConfig, - tenant_ids: Vec, + controller_client_conf: Option, + tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, ) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = if tenant_ids.is_empty() { + let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
const CONCURRENCY: usize = 32; + // Accumulate information about each tenant for cross-shard GC step we'll do at the end + let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + // Generate a stream of TenantTimelineId let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); @@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, + accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); let data = list_timeline_blobs(s3_client, ttid, target).await?; - let (latest_gen, candidates) = match &data.blob_data { + let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part, index_part_generation, s3_layers: _s3_layers, - } => (*index_part_generation, data.unused_index_keys), + } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. return Ok(summary); @@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc( } }; + accumulator.lock().unwrap().update(ttid, index_part); + for key in candidates { maybe_delete_index( s3_client, @@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc( Ok(summary) } - let timelines = timelines - .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); - let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); let mut summary = GcSummary::default(); - while let Some(i) = timelines.next().await { - let tl_summary = i?; + // Drain futures for per-shard GC, populating accumulator as a side effect + { + let timelines = timelines.map_ok(|ttid| { + gc_timeline( + &s3_client, + &bucket_config, + &min_age, + &target, + mode, + ttid, + &accumulator, + ) + }); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - summary.indices_deleted += tl_summary.indices_deleted; - summary.remote_storage_errors += tl_summary.remote_storage_errors; + while let Some(i) = timelines.next().await { + summary.merge(i?); + } + } + + // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC + let Some(controller_client) = controller_client_conf.as_ref().map(|c| { + let ControllerClientConfig { + controller_api, + controller_jwt, + } = c; + control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) + }) else { + tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); + return Ok(summary); + }; + + let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) + .unwrap() + .into_inner() + .unwrap() + .into_gc_ancestors(&controller_client, &mut summary) + .await; + + for ancestor_shard in ancestor_shards { + gc_ancestor( + &s3_client, + &bucket_config, + &target, + &min_age, + ancestor_shard, + &ancestor_refs, + mode, + &mut summary, + ) + .instrument(info_span!("gc_ancestor", %ancestor_shard)) + .await?; } Ok(summary) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index db7269ad41..9e39457c06 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -997,7 +997,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - StorageScrubber(self).scan_metadata() + self.env.storage_scrubber.scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") 
cleanup_error = e @@ -1225,6 +1225,9 @@ class NeonEnv: ) cfg["safekeepers"].append(sk_cfg) + # Scrubber instance for tests that use it, and for use during teardown checks + self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir) + log.info(f"Config: {cfg}") self.neon_cli.init( cfg, @@ -4265,9 +4268,9 @@ class Safekeeper(LogUtils): class StorageScrubber: - def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): + def __init__(self, env: NeonEnv, log_dir: Path): self.env = env - self.log_dir = log_dir or env.test_output_dir + self.log_dir = log_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -4284,11 +4287,14 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "storage_scrubber")] + base_args = [ + str(self.env.neon_binpath / "storage_scrubber"), + f"--controller-api={self.env.storage_controller_api}", + ] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.env.test_output_dir, + self.log_dir, args, echo_stderr=True, echo_stdout=True, @@ -4327,7 +4333,10 @@ class StorageScrubber: log.info(f"tenant-snapshot output: {stdout}") def pageserver_physical_gc( - self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + self, + min_age_secs: int, + tenant_ids: Optional[list[TenantId]] = None, + mode: Optional[str] = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4337,6 +4346,9 @@ class StorageScrubber: for tenant_id in tenant_ids: args.extend(["--tenant-id", str(tenant_id)]) + if mode is not None: + args.extend(["--mode", mode]) + stdout = self.scrubber_cli( args, timeout=30, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7ce38c5c3c..041942cda3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,7 +214,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. 
- metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() + metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 0416078ebc..58d61eab0d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, @@ -234,7 +234,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -555,7 +555,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4471237900..90c6e26d01 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, StorageControllerApiException, - StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -128,7 +127,7 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 35ae61c380..635690fc7f 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,14 +1,19 @@ import os import shutil +import threading +import time +from concurrent.futures import ThreadPoolExecutor from typing import Optional import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, - StorageScrubber, ) from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.utils import wait_until from fixtures.workload import Workload @@ -60,8 +65,7 @@ def 
test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = StorageScrubber(neon_env_builder) - scrubber.tenant_snapshot(tenant_id, output_path) + env.storage_scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -111,6 +115,14 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.validate() +def drop_local_state(env: NeonEnv, tenant_id: TenantId): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -133,28 +145,231 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads for _i in range(0, n_cycles): - env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) - env.storage_controller.reconcile_until_idle() - - env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) - env.storage_controller.reconcile_until_idle() + drop_local_state(env, tenant_id) # This write includes remote upload, will generate an index in this generation workload.write_rows(1) # With a high min_age, the scrubber should decline to delete anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = env.storage_scrubber.pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_scrubber_physical_gc_ancestors( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + 
"image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + assert shard_count is None or new_shard_count > shard_count + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + + # Make sure child shards have some layers + workload.write_rows(100) + + # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once + # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even + # if they were logically deleted before the shard split, just not physically deleted yet because of the queue. + for ps in env.pageservers: + ps.http_client().deletion_queue_flush(execute=True) + + # Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber + # should not erase any ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Write some data and compact: compacting, some ancestor layers should no longer be needed by children + # (the compaction is part of the checkpoint that Workload does for us) + workload.churn_rows(100) + workload.churn_rows(100) + workload.churn_rows(100) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + ps.http_client().timeline_compact(shard, timeline_id) + ps.http_client().timeline_gc(shard, timeline_id, 0) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # Our time threshold should be respected: check that with a high threshold we delete nothing + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Now run with a low time threshold: deletions of ancestor layers should be executed + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + # We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and + # attach it, to drop any local state, then check it's still readable. + workload.stop() + drop_local_state(env, tenant_id) + + workload.validate() + + +def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): + """ + Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards + which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly + GC any ancestor layers. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + initial_shard_count = 2 + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=initial_shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + unstuck = threading.Event() + + def stuck_split(): + # Pause our shard split after the first shard but before the second, such that when we run + # the scrub, the S3 bucket contains shards 0002, 0101, 0004, 0204 (but not 0104, 0304). + env.storage_controller.configure_failpoints( + ("shard-split-post-remote-sleep", "return(3600000)") + ) + try: + split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + except Exception as e: + log.info(f"Split failed with {e}") + else: + if not unstuck.is_set(): + raise RuntimeError(f"Split succeeded unexpectedly ({split_response})") + + with ThreadPoolExecutor(max_workers=1) as threads: + log.info("Starting hung shard split") + stuck_split_fut = threads.submit(stuck_split) + + # Let the controller reach the failpoint + wait_until( + 10, + 1, + lambda: env.storage_controller.assert_log_contains( + 'failpoint "shard-split-post-remote-sleep": sleeping' + ), + ) + + # Run compaction on the new child shards, so that they drop some refs to their parent + child_shards = [ + TenantShardId(tenant_id, 0, 4), + TenantShardId(tenant_id, 2, 4), + ] + log.info("Compacting first two children") + for child in child_shards: + env.get_tenant_pageserver( + TenantShardId(tenant_id, 0, initial_shard_count) + ).http_client().timeline_compact(child, timeline_id) + + # Check that the other child shards weren't created + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None + + # Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all + # ancestor layers as a reason to GC them, because it should realize that a split is in progress. + # (GC requires that controller does not indicate split in progress, and that if we see the highest + # shard count N, then there are N shards present with that shard count). 
+ gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC partway through split: {gc_output}") + assert gc_output["ancestor_layers_deleted"] == 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + # Storage controller shutdown lets our split request client complete + log.info("Stopping storage controller") + unstuck.set() + env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*") + env.storage_controller.stop() + stuck_split_fut.result() + + # Restart the controller and retry the split with the failpoint disabled, this should + # complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers + log.info("Starting & retrying split") + env.storage_controller.start() + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # The other child shards exist now, we can compact them to drop refs to ancestor + log.info("Compacting second two children") + for child in [ + TenantShardId(tenant_id, 1, 4), + TenantShardId(tenant_id, 3, 4), + ]: + env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id) + + gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC after split completed: {gc_output}") + assert gc_output["ancestor_layers_deleted"] > 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 1d7c8b8e31..6d20b3d0de 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,7 +5,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - StorageScrubber, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException @@ -325,7 +324,6 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() @@ -340,7 +338,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = scrubber.scan_metadata() + result = env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] env.start() @@ -348,5 +346,5 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) ps_http.tenant_delete(tenant_id) env.stop() - scrubber.scan_metadata() + env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] From 067363fe95678c0d470654f01d6b3177618d09f5 Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Sat, 20 Jul 2024 02:10:19 +0900 Subject: [PATCH 281/412] safekeeper: remove unused safekeeper runtimes (#8433) There are unused safekeeper runtimes `WAL_REMOVER_RUNTIME` and `METRICS_SHIFTER_RUNTIME`. `WAL_REMOVER_RUNTIME` was implemented in [#4119](https://github.com/neondatabase/neon/pull/4119) and removed in [#7887](https://github.com/neondatabase/neon/pull/7887). 
`METRICS_SHIFTER_RUNTIME` was also implemented in [#4119](https://github.com/neondatabase/neon/pull/4119) but has never been used. I removed unused safekeeper runtimes `WAL_REMOVER_RUNTIME` and `METRICS_SHIFTER_RUNTIME`. --- safekeeper/src/lib.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index af83feb77f..8f2920ada3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create broker runtime") }); -pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("WAL remover") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); - pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() .thread_name("WAL backup worker") @@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); - -pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("metric shifter") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); From bba062e2626051825437989577142ae97fb2bbe0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 19 Jul 2024 18:30:28 +0100 Subject: [PATCH 282/412] tests: longer timeouts in test_timeline_deletion_with_files_stuck_in_upload_queue (#8438) ## Problem This test had two locations with 2 second timeouts, which is rather low when we run on a highly contended test machine running lots of tests in parallel. It usually passes, but today I've seen both of these locations time out on separate PRs. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8432/10007868041/index.html#suites/837740b64a53e769572c4ed7b7a7eeeb/6c6a092be083d27c ## Summary of changes - Change 2 second timeouts to 20 second timeouts --- test_runner/regress/test_remote_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index fac7fe9dee..09f941f582 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -577,7 +577,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(20, 0.1, assert_compacted_and_uploads_queued) + wait_until(200, 0.1, assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -619,7 +619,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) # timeline deletion should be unblocking checkpoint ops - checkpoint_thread.join(2.0) + checkpoint_thread.join(20.0) assert not checkpoint_thread.is_alive() # Just to be sure, unblock ongoing uploads. 
If the previous assert was incorrect, or the prometheus metric broken, From f7f9b4aaec21775147dce63100b0cb35b3d90d50 Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Sat, 20 Jul 2024 03:20:57 +0900 Subject: [PATCH 283/412] Fix openapi specification (#8273) ## Problem There are some swagger errors in `pageserver/src/http/openapi_spec.yml` ``` Error 431 15000 Object includes not allowed fields Error 569 3100401 should always have a 'required' Error 569 15000 Object includes not allowed fields Error 1111 10037 properties members must be schemas ``` ## Summary of changes Fixed the above errors. --- pageserver/src/http/openapi_spec.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ae109ec1e7..4d243ddeb9 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -377,7 +377,7 @@ paths: schema: $ref: "#/components/schemas/ConflictError" - /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path @@ -429,7 +429,9 @@ paths: schema: $ref: "#/components/schemas/SyntheticSizeResponse" text/html: - description: SVG representation of the tenant and it's timelines. + schema: + type: string + description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: @@ -568,7 +570,7 @@ paths: type: string - name: timeline_id in: path - ŕequired: true + required: true schema: type: string @@ -774,15 +776,13 @@ components: TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' + - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string - generation: - type: integer - description: Attachment generation number. TenantLoadRequest: type: object properties: @@ -1106,7 +1106,7 @@ components: reparented_timelines: type: array description: Set of reparented timeline ids - properties: + items: type: string format: hex description: TimelineId From 21b3a191bfd1a1eb1a8df773431a92939033d063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 19 Jul 2024 21:01:59 +0200 Subject: [PATCH 284/412] Add archival_config endpoint to pageserver (#8414) This adds an archival_config endpoint to the pageserver. Currently it has no effect, and always "works", but later the intent is that it will make a timeline archived/unarchived. 
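For illustration, flipping a timeline's archival state through the new endpoint could look roughly like the sketch below (the pageserver URL, tenant shard ID and timeline ID are placeholders, and the method follows the `.post(...)` registration added in `routes.rs`):

```python
# Hypothetical sketch: ask the pageserver to archive a timeline via the new endpoint.
# The base URL and IDs below are placeholders, not real values.
import requests

PAGESERVER_HTTP = "http://127.0.0.1:9898"             # placeholder pageserver HTTP address
TENANT_SHARD_ID = "00000000000000000000000000000000"  # placeholder tenant (shard) id
TIMELINE_ID = "11111111111111111111111111111111"      # placeholder timeline id

resp = requests.post(
    f"{PAGESERVER_HTTP}/v1/tenant/{TENANT_SHARD_ID}/timeline/{TIMELINE_ID}/archival_config",
    json={"state": "Archived"},  # TimelineArchivalState: "Archived" or "Unarchived"
)
resp.raise_for_status()  # the handler is currently a no-op, so this should just return 200
```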
- [x] add yml spec - [x] add endpoint handler Part of https://github.com/neondatabase/neon/issues/8088 --- libs/pageserver_api/src/models.rs | 11 ++++++ pageserver/src/http/openapi_spec.yml | 54 ++++++++++++++++++++++++++++ pageserver/src/http/routes.rs | 44 +++++++++++++++++++++-- pageserver/src/tenant.rs | 9 +++++ 4 files changed, 115 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6abdcb88d0..231a604b47 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -651,6 +651,17 @@ pub struct TenantDetails { pub timelines: Vec, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TimelineArchivalState { + Archived, + Unarchived, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelineArchivalConfigRequest { + pub state: TimelineArchivalState, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4d243ddeb9..087d281a0c 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -397,6 +397,51 @@ paths: "202": description: Tenant scheduled to load successfully + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + put: + description: | + Either archives or unarchives the given timeline. + An archived timeline may not have any non-archived children. + requestBody: + required: false + content: + application/json: + schema: + $ref: "#/components/schemas/ArchivalConfigRequest" + responses: + "200": + description: Timeline (un)archived successfully + "409": + description: | + The tenant/timeline is already being modified, perhaps by a concurrent call to this API + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id @@ -846,6 +891,15 @@ components: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. 
+ ArchivalConfigRequest: + type: object + required + - state + properties: + state: + description: The archival state of a timeline + type: string + enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d7ef70477f..b8063eb5a2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,14 +18,17 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigRequest; use pageserver_api::models::TenantLocationConfigResponse; use pageserver_api::models::TenantScanRemoteStorageResponse; use pageserver_api::models::TenantScanRemoteStorageShard; @@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; +use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest, -}; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; @@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler( json_response(StatusCode::OK, ()) } +async fn timeline_archival_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant + .apply_timeline_archival_config(timeline_id, request_data.state) + .await + .context("applying archival config") + .map_err(ApiError::InternalServerError)?; + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_archival_config", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + state = ?request_data.state, + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -2789,6 +2823,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", + |r| api_handler(r, timeline_archival_config_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { 
api_handler(r, timeline_detail_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 637051413f..01f7ac626b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -1228,6 +1229,14 @@ impl Tenant { Ok(timeline_preloads) } + pub async fn apply_timeline_archival_config( + &self, + _timeline_id: TimelineId, + _config: TimelineArchivalState, + ) -> anyhow::Result<()> { + Ok(()) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } From 31bfeaf9343f3c40fa798246dcd5d8c88aaf4aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 19 Jul 2024 21:19:30 +0200 Subject: [PATCH 285/412] Use DefaultCredentialsChain AWS authentication in remote_storage (#8440) PR #8299 has switched the storage scrubber to use `DefaultCredentialsChain`. Now we do this for `remote_storage`, as it allows us to use `remote_storage` from inside kubernetes. Most of the diff is due to `GenericRemoteStorage::from_config` becoming `async fn`. --- libs/remote_storage/src/lib.rs | 4 +- libs/remote_storage/src/s3_bucket.rs | 64 ++++------ libs/remote_storage/tests/test_real_azure.rs | 7 +- libs/remote_storage/tests/test_real_s3.rs | 7 +- pageserver/ctl/src/main.rs | 2 +- pageserver/src/bin/pageserver.rs | 6 +- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/deletion_queue.rs | 20 ++- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant.rs | 116 +++++++++++------- pageserver/src/tenant/mgr.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 8 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- .../src/tenant/storage_layer/layer/tests.rs | 25 ++-- .../tenant/storage_layer/merge_iterator.rs | 12 +- pageserver/src/tenant/timeline.rs | 5 +- .../walreceiver/connection_manager.rs | 17 +-- pageserver/src/walingest.rs | 16 ++- proxy/src/context/parquet.rs | 10 +- proxy/src/usage_metrics.rs | 14 ++- safekeeper/src/bin/safekeeper.rs | 2 +- safekeeper/src/wal_backup.rs | 26 ++-- 23 files changed, 219 insertions(+), 156 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index d440c03a0e..3381c4296f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -443,7 +443,7 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { - pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { @@ -458,7 +458,7 @@ impl GenericRemoteStorage { std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { let storage_account = 
azure_config diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ef1bd2c047..b65d8b7e9e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,16 +16,10 @@ use std::{ use anyhow::{anyhow, Context as _}; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::CredentialsProviderChain, - profile::ProfileFileCredentialsProvider, - provider_config::ProviderConfig, + default_provider::credentials::DefaultCredentialsChain, retry::{RetryConfigBuilder, RetryMode}, - web_identity_token::WebIdentityTokenCredentialsProvider, BehaviorVersion, }; -use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, @@ -76,40 +70,27 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", remote_storage_config.bucket_name ); - let region = Some(Region::new(remote_storage_config.bucket_region.clone())); + let region = Region::new(remote_storage_config.bucket_region.clone()); + let region_opt = Some(region.clone()); - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; + // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html + // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html + // Incomplete list of auth methods used by this: + // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + // * "AWS_PROFILE" / `aws sso login --profile ` + // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // * http (ECS/EKS) container credentials + // * imds v2 + let credentials_provider = DefaultCredentialsChain::builder() + .region(region) + .build() + .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); @@ -118,9 +99,9 @@ impl S3Bucket { #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ BehaviorVersion::v2023_11_09(), ) - .region(region) + .region(region_opt) .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .credentials_provider(credentials_provider) .sleep_impl(SharedAsyncSleep::from(sleep_impl)); let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { @@ -1041,8 +1022,8 @@ mod tests { use crate::{RemotePath, 
S3Bucket, S3Config}; - #[test] - fn relative_path() { + #[tokio::test] + async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() @@ -1085,8 +1066,9 @@ mod tests { max_keys_per_list_response: Some(5), upload_storage_class: None, }; - let storage = - S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); + let storage = S3Bucket::new(&config, std::time::Duration::ZERO) + .await + .expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 23628dfebe..3a20649490 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -31,6 +31,7 @@ struct EnabledAzure { impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) + .await .context("Azure client creation") .expect("Azure client creation failed"); @@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_azure_client( +async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -221,6 +222,8 @@ fn create_azure_client( timeout: Duration::from_secs(120), }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index a273abe867..342bc6da0b 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -197,6 +197,7 @@ struct EnabledS3 { impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) + .await .context("S3 client creation") .expect("S3 client creation failed"); @@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_s3_client( +async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -385,7 +386,9 @@ fn create_s3_client( timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index ea09a011e5..3fabf62987 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> { .get("remote_storage") .expect("need remote_storage"); let config = RemoteStorageConfig::from_toml(toml_item)?; - let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); storage .unwrap() diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fceddfb757..ec1ceb54ce 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -385,7 +385,7 @@ fn start_pageserver( let shutdown_pageserver = 
tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -701,7 +701,7 @@ fn start_pageserver( } } -fn create_remote_storage_client( +async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { @@ -711,7 +711,7 @@ fn create_remote_storage_client( }; // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; + let mut remote_storage = GenericRemoteStorage::from_config(config).await?; // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 6861adad2c..9104da6072 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -96,7 +96,7 @@ pub async fn collect_metrics( .expect("Failed to create http client with timeout"); let bucket_client = if let Some(bucket_config) = metric_collection_bucket { - match GenericRemoteStorage::from_config(bucket_config) { + match GenericRemoteStorage::from_config(bucket_config).await { Ok(client) => Some(client), Err(e) => { // Non-fatal error: if we were given an invalid config, we will proceed diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 3e48552ace..22f7d5b824 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -828,9 +828,9 @@ mod test { } } - fn setup(test_name: &str) -> anyhow::Result { + async fn setup(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; // We do not load() the harness: we only need its config and remote_storage @@ -844,7 +844,9 @@ mod test { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&storage_config) + .await + .unwrap(); let mock_control_plane = MockControlPlane::new(); @@ -922,7 +924,9 @@ mod test { #[tokio::test] async fn deletion_queue_smoke() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let ctx = setup("deletion_queue_smoke") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -992,7 +996,9 @@ mod test { #[tokio::test] async fn deletion_queue_validation() -> anyhow::Result<()> { - let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let ctx = setup("deletion_queue_validation") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1051,7 +1057,9 @@ mod test { #[tokio::test] async fn deletion_queue_recovery() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let mut ctx = setup("deletion_queue_recovery") + .await + .expect("Failed test setup"); let client = 
ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a821b824d0..3bbd084ab4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -2031,7 +2031,7 @@ mod tests { #[tokio::test] async fn aux_files_round_trip() -> anyhow::Result<()> { let name = "aux_files_round_trip"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 01f7ac626b..6d59752606 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3797,7 +3797,7 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create_custom( + pub async fn create_custom( test_name: &'static str, tenant_conf: TenantConf, tenant_id: TenantId, @@ -3833,7 +3833,7 @@ pub(crate) mod harness { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { @@ -3848,7 +3848,7 @@ pub(crate) mod harness { }) } - pub fn create(test_name: &'static str) -> anyhow::Result { + pub async fn create(test_name: &'static str) -> anyhow::Result { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. let tenant_conf = TenantConf { @@ -3865,6 +3865,7 @@ pub(crate) mod harness { shard, Generation::new(0xdeadbeef), ) + .await } pub fn span(&self) -> tracing::Span { @@ -4001,7 +4002,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4048,7 +4049,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines") + .await? .load() .await; let _ = tenant @@ -4080,7 +4082,7 @@ mod tests { async fn test_branch() -> anyhow::Result<()> { use std::str::from_utf8; - let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4202,7 +4204,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") + .await? .load() .await; let tline = tenant @@ -4249,7 +4252,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn") + .await? 
.load() .await; @@ -4304,7 +4308,8 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline") + .await? .load() .await; let tline = tenant @@ -4361,7 +4366,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child") + .await? .load() .await; let tline = tenant @@ -4391,10 +4397,10 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let (tenant, ctx) = - TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4432,7 +4438,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4459,7 +4465,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; // create two timelines { let (tenant, ctx) = harness.load().await; @@ -4507,7 +4513,10 @@ mod tests { #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; - let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_layer_dumping") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4534,7 +4543,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4705,7 +4714,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_bulk_insert")?; + let harness = TenantHarness::create("test_bulk_insert").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4736,7 +4745,7 @@ mod tests { // so the search can stop at the first delta layer and doesn't traverse any deeper. 
#[tokio::test] async fn test_get_vectored() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored")?; + let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4814,7 +4823,7 @@ mod tests { #[tokio::test] async fn test_get_vectored_aux_files() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_aux_files")?; + let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4900,7 +4909,8 @@ mod tests { TenantId::generate(), ShardIdentity::unsharded(), Generation::new(0xdeadbeef), - )?; + ) + .await?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5043,7 +5053,7 @@ mod tests { // ``` #[tokio::test] async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5192,7 +5202,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5276,7 +5286,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches") + .await? .load() .await; let mut tline = tenant @@ -5366,7 +5377,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") + .await? .load() .await; let mut tline = tenant @@ -5432,7 +5444,8 @@ mod tests { #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") + .await? 
.load() .await; @@ -5501,7 +5514,7 @@ mod tests { #[tokio::test] async fn test_create_guard_crash() -> anyhow::Result<()> { let name = "test_create_guard_crash"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -5554,7 +5567,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5578,7 +5591,7 @@ mod tests { #[tokio::test] async fn test_metadata_scan() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_scan")?; + let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5697,7 +5710,7 @@ mod tests { #[tokio::test] async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5756,7 +5769,9 @@ mod tests { #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") + .await + .unwrap(); // the default aux file policy to switch is v1 if not set by the admins assert_eq!( @@ -5858,7 +5873,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6032,7 +6049,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_force_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; let (tenant, ctx) = harness.load().await; @@ -6093,7 +6112,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_auto_detect") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6156,7 +6177,7 @@ mod tests { #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_image_creation")?; + let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -6255,7 +6276,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { - let harness = 
TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); @@ -6327,7 +6348,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6419,7 +6440,7 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6499,7 +6520,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_image_creation() { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6571,8 +6594,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_empty_image_creation() { - let harness = - TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6635,7 +6659,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6843,7 +6867,7 @@ mod tests { #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_neon_test_record")?; + let harness = TenantHarness::create("test_neon_test_record").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6924,7 +6948,7 @@ mod tests { #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -7013,7 +7037,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b0159e22bf..4912608677 100644 --- a/pageserver/src/tenant/mgr.rs +++ 
b/pageserver/src/tenant/mgr.rs @@ -2698,7 +2698,9 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant") + .await + .unwrap(); let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 66b759c8e0..bb42fbeebf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2128,7 +2128,7 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c34923320a..512e9e86fa 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1934,7 +1934,7 @@ pub(crate) mod test { #[tokio::test] async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?; let (tenant, ctx) = harness.load().await; let timeline_id = TimelineId::generate(); @@ -2034,7 +2034,9 @@ pub(crate) mod test { use crate::walrecord::NeonWalRecord; use bytes::Bytes; - let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant @@ -2312,7 +2314,7 @@ pub(crate) mod test { #[tokio::test] async fn delta_layer_iterator() { - let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 45b47bb62b..19e4e9e2e9 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1111,6 +1111,7 @@ mod test { ShardIdentity::unsharded(), get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1177,6 +1178,7 @@ mod test { // But here, all we care about is that the gen number is unique. 
get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1308,7 +1310,7 @@ mod test { #[tokio::test] async fn image_layer_iterator() { - let harness = TenantHarness::create("image_layer_iterator").unwrap(); + let harness = TenantHarness::create("image_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 3a7aca7a6c..8a3737f8a7 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s async fn smoke_test() { let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("smoke_test").unwrap(); + let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; @@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -258,7 +260,9 @@ fn read_wins_pending_eviction() { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let h = TenantHarness::create("read_wins_pending_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create(name).unwrap(); + let h = TenantHarness::create(name).await.unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { #[tokio::test(start_paused = true)] async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let handle = tokio::runtime::Handle::current(); - let h = - TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let timeline = tenant @@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { #[tokio::test(start_paused = true)] async fn evict_and_wait_does_not_wait_for_download() { // let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let 
download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + let h = TenantHarness::create("eviction_cancellation_on_drop") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 6f59b2fd77..eb4a1f28a1 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -293,7 +293,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap(); + let harness = TenantHarness::create("merge_iterator_merge_in_between") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -356,7 +358,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_delta_merge") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -430,7 +434,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3d3d3ac34d..19b1396981 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6046,8 +6046,9 @@ mod tests { #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { - let harness = - TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1d2ffec08f..de50f217d8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1118,7 +1118,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1151,7 +1151,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1216,7 +1216,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = 
TenantHarness::create("no_connection_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1279,7 +1279,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1319,7 +1319,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1385,7 +1385,8 @@ mod tests { #[tokio::test] async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; + let harness = + TenantHarness::create("timeout_connection_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1448,7 +1449,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1550,7 +1551,7 @@ mod tests { // and pageserver should prefer to connect to it. let test_az = Some("test_az".to_owned()); - let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let harness = TenantHarness::create("switch_to_same_availability_zone").await?; let mut state = dummy_state(&harness).await; state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 07c90385e6..dff3a8f52d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1754,7 +1754,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1975,7 +1975,10 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_drop_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2046,7 +2049,10 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend") + .await? 
+ .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2188,7 +2194,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2296,7 +2302,7 @@ mod tests { let startpoint = Lsn::from_hex("14AEC08").unwrap(); let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); - let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); let (tenant, ctx) = harness.load().await; let remote_initdb_path = diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index cfc1f8e89e..543a458274 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -181,8 +181,9 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) @@ -217,6 +218,7 @@ pub async fn worker( let storage_disconnect = GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( @@ -545,7 +547,9 @@ mod tests { }, timeout: std::time::Duration::from_secs(120), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .unwrap(); worker_inner(storage, rx, config).await.unwrap(); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 56ed2145dc..a8735fe0bb 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -357,11 +357,15 @@ pub async fn task_backup( info!("metrics backup has shut down"); } // Even if the remote storage is not configured, we still want to clear the metrics. 
- let storage = backup_config - .remote_storage_config - .as_ref() - .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) - .transpose()?; + let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { + Some( + GenericRemoteStorage::from_config(config) + .await + .context("remote storage init")?, + ) + } else { + None + }; let mut ticker = tokio::time::interval(backup_config.interval); let mut prev = Utc::now(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9eb6546d6b..2365fd0587 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf); + wal_backup::init_remote_storage(&conf).await; // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5a590689c3..7ecee178f3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -22,7 +22,7 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::watch; +use tokio::sync::{watch, OnceCell}; use tokio::time::sleep; use tracing::*; @@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline}; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; -use once_cell::sync::OnceCell; - const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -167,7 +165,7 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::new(); +static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); // Storage must be configured and initialized when this is called. fn get_configured_remote_storage() -> &'static GenericRemoteStorage { @@ -178,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } -pub fn init_remote_storage(conf: &SafeKeeperConf) { +pub async fn init_remote_storage(conf: &SafeKeeperConf) { // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide // dependencies to all tasks instead. - REMOTE_STORAGE.get_or_init(|| { - conf.remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); + REMOTE_STORAGE + .get_or_init(|| async { + if let Some(conf) = conf.remote_storage.as_ref() { + Some( + GenericRemoteStorage::from_config(conf) + .await + .expect("failed to create remote storage"), + ) + } else { + None + } + }) + .await; } struct WalBackupTask { From a1272b6ed8b6e4139bf52bb340ed7ef893019d46 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Jul 2024 11:41:12 +0100 Subject: [PATCH 286/412] pageserver: use identity file as node id authority and remove init command and config-override flags (#7766) Ansible will soon write the node id to `identity.toml` in the work dir for new pageservers. On the pageserver side, we read the node id from the identity file if it is present and use that as the source of truth. If the identity file is missing, cannot be read, or does not deserialise, start-up is aborted. 
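For illustration, a minimal sketch of the resulting startup contract, assuming the workdir layout used by the Dockerfile and docker-compose changes below; the path and the `1234` node id are illustrative, not requirements:

```bash
# Hypothetical layout: the node id lives in identity.toml, everything else stays in pageserver.toml.
mkdir -p /data/.neon
echo 'id=1234' > /data/.neon/identity.toml   # single source of truth for the node id
# ... write pageserver.toml with the remaining settings ...
/usr/local/bin/pageserver -D /data/.neon     # aborts if identity.toml is missing, unreadable, or malformed
```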
This PR also removes the `--init` mode and the `--config-override` flag from the `pageserver` binary. The neon_local is already not using these flags anymore. Ansible still uses them until the linked change is merged & deployed, so, this PR has to land simultaneously or after the Ansible change due to that. Related Ansible change: https://github.com/neondatabase/aws/pull/1322 Cplane change to remove config-override usages: https://github.com/neondatabase/cloud/pull/13417 Closes: https://github.com/neondatabase/neon/issues/7736 Overall plan: https://www.notion.so/neondatabase/Rollout-Plan-simplified-pageserver-initialization-f935ae02b225444e8a41130b7d34e4ea?pvs=4 Co-authored-by: Christian Schwarz --- Dockerfile | 18 ++-- control_plane/src/pageserver.rs | 18 ++++ .../compute_wrapper/shell/compute.sh | 2 +- docker-compose/docker-compose.yml | 15 +-- .../pageserver_config/identity.toml | 1 + .../pageserver_config/pageserver.toml | 5 + pageserver/src/bin/pageserver.rs | 101 ++++++------------ pageserver/src/config.rs | 68 ++++++++---- test_runner/regress/test_pageserver_api.py | 64 ----------- 9 files changed, 117 insertions(+), 175 deletions(-) create mode 100644 docker-compose/pageserver_config/identity.toml create mode 100644 docker-compose/pageserver_config/pageserver.toml diff --git a/Dockerfile b/Dockerfile index a41598ef72..ace112cccf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. -RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ - && /usr/local/bin/pageserver -D /data/.neon/ --init \ - -c "id=1234" \ - -c "broker_endpoint='http://storage_broker:50051'" \ - -c "pg_distrib_dir='/usr/local/'" \ - -c "listen_pg_addr='0.0.0.0:6400'" \ - -c "listen_http_addr='0.0.0.0:9898'" +RUN mkdir -p /data/.neon/ && \ + echo "id=1234" > "/data/.neon/identity.toml" && \ + echo "broker_endpoint='http://storage_broker:50051'\n" \ + "pg_distrib_dir='/usr/local/'\n" \ + "listen_pg_addr='0.0.0.0:6400'\n" \ + "listen_http_addr='0.0.0.0:9898'\n" \ + > /data/.neon/pageserver.toml && \ + chown -R neon:neon /data/.neon # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. 
@@ -110,3 +111,6 @@ VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 + +CMD /usr/local/bin/pageserver -D /data/.neon + diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index e3d1d0e110..ba4f98d945 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -25,6 +25,7 @@ use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::auth::{Claims, Scope}; +use utils::id::NodeId; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -74,6 +75,10 @@ impl PageServerNode { } } + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { + toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + } + fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, @@ -186,6 +191,19 @@ impl PageServerNode { .write_all(config.to_string().as_bytes()) .context("write pageserver toml")?; drop(config_file); + + let identity_file_path = datadir.join("identity.toml"); + let mut identity_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(identity_file_path) + .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?; + let identity_toml = self.pageserver_make_identity_toml(node_id); + identity_file + .write_all(identity_toml.to_string().as_bytes()) + .context("write identity toml")?; + drop(identity_toml); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index f646e36f59..33455e458a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -33,7 +33,7 @@ echo $result | jq . 
generate_id timeline_id PARAMS=( - -sb + -sbf -X POST -H "Content-Type: application/json" -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 5503b6611a..6e15fdbe0d 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -31,25 +31,14 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints - entrypoint: - - "/bin/sh" - - "-c" - command: - - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoint=$$BROKER_ENDPOINT\" - -c \"listen_pg_addr='0.0.0.0:6400'\" - -c \"listen_http_addr='0.0.0.0:9898'\" - -c \"remote_storage={endpoint='http://minio:9000', - bucket_name='neon', - bucket_region='eu-north-1', - prefix_in_bucket='/pageserver/'}\"" + volumes: + - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets diff --git a/docker-compose/pageserver_config/identity.toml b/docker-compose/pageserver_config/identity.toml new file mode 100644 index 0000000000..20121327c7 --- /dev/null +++ b/docker-compose/pageserver_config/identity.toml @@ -0,0 +1 @@ +id=1234 diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml new file mode 100644 index 0000000000..76935453b6 --- /dev/null +++ b/docker-compose/pageserver_config/pageserver.toml @@ -0,0 +1,5 @@ +broker_endpoint='http://storage_broker:50051' +pg_distrib_dir='/usr/local/' +listen_pg_addr='0.0.0.0:6400' +listen_http_addr='0.0.0.0:9898' +remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ec1ceb54ce..28846ffbae 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,17 +2,18 @@ //! Main entry point for the Page Server executable. +use std::env; use std::env::{var, VarError}; use std::io::Read; use std::sync::Arc; use std::time::Duration; -use std::{env, ops::ControlFlow, str::FromStr}; use anyhow::{anyhow, Context}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; @@ -25,7 +26,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ - config::{defaults::*, PageServerConf}, + config::PageServerConf, context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, @@ -84,18 +85,13 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); + let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? 
{ - ControlFlow::Continue(conf) => conf, - ControlFlow::Break(()) => { - info!("Pageserver config init successful"); - return Ok(()); - } - }; + let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // @@ -150,70 +146,55 @@ fn main() -> anyhow::Result<()> { } fn initialize_config( + identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, - arg_matches: clap::ArgMatches, workdir: &Utf8Path, -) -> anyhow::Result> { - let init = arg_matches.get_flag("init"); - - let file_contents: Option = match std::fs::File::open(cfg_file_path) { +) -> anyhow::Result<&'static PageServerConf> { + // The deployment orchestrator writes out an indentity file containing the node id + // for all pageservers. This file is the source of truth for the node id. In order + // to allow for rolling back pageserver releases, the node id is also included in + // the pageserver config that the deployment orchestrator writes to disk for the pageserver. + // A rolled back version of the pageserver will get the node id from the pageserver.toml + // config file. + let identity = match std::fs::File::open(identity_file_path) { Ok(mut f) => { - if init { - anyhow::bail!("config file already exists: {cfg_file_path}"); + let md = f.metadata().context("stat config file")?; + if !md.is_file() { + anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); } + + let mut s = String::new(); + f.read_to_string(&mut s).context("read identity file")?; + toml_edit::de::from_str::(&s)? + } + Err(e) => { + anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + } + }; + + let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { let md = f.metadata().context("stat config file")?; if md.is_file() { let mut s = String::new(); f.read_to_string(&mut s).context("read config file")?; - Some(s.parse().context("parse config file toml")?) + s.parse().context("parse config file toml")? 
} else { anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); } } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, Err(e) => { anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } }; - let mut effective_config = file_contents.unwrap_or_else(|| { - DEFAULT_CONFIG_FILE - .parse() - .expect("unit tests ensure this works") - }); - - // Patch with overrides from the command line - if let Some(values) = arg_matches.get_many::("config-override") { - for option_line in values { - let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!("Option '{option_line}' could not be parsed as a toml document") - })?; - - for (key, item) in doc.iter() { - effective_config.insert(key, item.clone()); - } - } - } - - debug!("Resulting toml: {effective_config}"); + debug!("Using pageserver toml: {config}"); // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(&effective_config, workdir) + let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) .context("Failed to parse pageserver configuration")?; - if init { - info!("Writing pageserver config to '{cfg_file_path}'"); - - std::fs::write(cfg_file_path, effective_config.to_string()) - .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; - info!("Config successfully written to '{cfg_file_path}'") - } - - Ok(if init { - ControlFlow::Break(()) - } else { - ControlFlow::Continue(Box::leak(Box::new(conf))) - }) + Ok(Box::leak(Box::new(conf))) } struct WaitForPhaseResult { @@ -734,28 +715,12 @@ fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize pageserver with all given config overrides"), - ) .arg( Arg::new("workdir") .short('D') .long("workdir") .help("Working directory for the pageserver"), ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .long("config-override") - .short('c') - .num_args(1) - .action(ArgAction::Append) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 35b4e79365..b4359b926d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,8 +7,8 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde; use serde::de::IntoDeserializer; +use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -406,6 +406,13 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { + fn new(node_id: NodeId) -> Self { + let mut this = Self::default(); + this.id(node_id); + + this + } + #[inline(always)] fn default_values() -> Self { use self::BuilderValue::*; @@ -881,8 +888,12 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. 
- pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::default(); + pub fn parse_and_validate( + node_id: NodeId, + toml: &Document, + workdir: &Utf8Path, + ) -> anyhow::Result { + let mut builder = PageServerConfigBuilder::new(node_id); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -913,7 +924,8 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), + "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth + // Logging is not set up yet, so we can't do it. "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1090,6 +1102,12 @@ impl PageServerConf { } } +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageserverIdentity { + pub id: NodeId, +} + // Helper functions to parse a toml Item fn parse_toml_string(name: &str, item: &Item) -> Result { @@ -1259,7 +1277,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1341,7 +1359,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1431,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for the local FS"); assert_eq!( parsed_remote_storage_config, @@ -1492,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for S3"); assert_eq!( parsed_remote_storage_config, @@ -1576,7 +1596,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(&toml, 
&workdir)?; + let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); assert_eq!( @@ -1592,7 +1612,11 @@ threshold = "20m" .evictions_low_residence_duration_metric_threshold, Duration::from_secs(20 * 60) ); - assert_eq!(conf.id, NodeId(222)); + + // Assert that the node id provided by the indentity file (threaded + // through the call to [`PageServerConf::parse_and_validate`] is + // used. + assert_eq!(conf.id, NodeId(333)); assert_eq!( conf.disk_usage_based_eviction, Some(DiskUsageEvictionTaskConfig { @@ -1637,7 +1661,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); match &conf.default_tenant_conf.eviction_policy { EvictionPolicy::OnlyImitiate(t) => { @@ -1656,7 +1680,7 @@ threshold = "20m" remote_storage = {} "#; let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(&doc, &workdir) + let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); assert!(format!("{err}").contains("remote_storage"), "{err}"); } diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index caeae7fd15..28dbf40bed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,8 +1,5 @@ -import subprocess -from pathlib import Path from typing import Optional -import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -13,67 +10,6 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import wait_until -def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): - """ - NB: The neon_local doesn't use `--init` mode anymore, but our production - deployment still does => https://github.com/neondatabase/aws/pull/1322 - """ - workdir = neon_simple_env.pageserver.workdir - pageserver_config = workdir / "pageserver.toml" - pageserver_bin = neon_binpath / "pageserver" - - def run_pageserver(args): - return subprocess.run( - [str(pageserver_bin), "-D", str(workdir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - neon_simple_env.pageserver.stop() - - with open(neon_simple_env.pageserver.config_toml_path, "r") as f: - ps_config = toml.load(f) - - required_config_keys = [ - "pg_distrib_dir", - "listen_pg_addr", - "listen_http_addr", - "pg_auth_type", - "http_auth_type", - # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 - # "tenant_config", - ] - required_config_overrides = [ - f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys - ] - - pageserver_config.unlink() - - bad_init = run_pageserver(["--init", *required_config_overrides]) - assert ( - bad_init.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert 'missing config value "id"' in bad_init.stderr - assert not pageserver_config.exists(), "config file should not be created after init error" - - good_init_cmd = [ - "--init", - f"--config-override=id={ps_config['id']}", - *required_config_overrides, - ] - 
completed_init = run_pageserver(good_init_cmd)
-    assert (
-        completed_init.returncode == 0
-    ), "pageserver should be able to create a new config with the node id given"
-    assert pageserver_config.exists(), "config file should be created successfully"
-
-    bad_reinit = run_pageserver(good_init_cmd)
-    assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists"
-    assert "config file already exists" in bad_reinit.stderr
-
-
 def check_client(env: NeonEnv, client: PageserverHttpClient):
     pg_version = env.pg_version
     initial_tenant = env.initial_tenant

From 59eeadabe9bb65b92917e56950f5909bb897604b Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Wed, 17 Jul 2024 10:04:46 +0300
Subject: [PATCH 287/412] Change default version of Neon extension to 1.4

---
 pgxn/neon/neon.control | 2 +-
 test_runner/regress/test_lfc_working_set_approximation.py | 2 +-
 test_runner/regress/test_neon_extension.py | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control
index cee2f336f2..03bdb9a0b4 100644
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,6 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.3'
+default_version = '1.4'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py
index 6465bdfd21..4c53e4e2fd 100644
--- a/test_runner/regress/test_lfc_working_set_approximation.py
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -89,7 +89,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
     )
     conn = endpoint.connect()
     cur = conn.cursor()
-    cur.execute("create extension neon version '1.4'")
+    cur.execute("create extension neon")
     cur.execute(
         "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
     )
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index e83aaf91c6..bb844244e3 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
         # IMPORTANT:
         # If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] - current_version = "1.3" + current_version = "1.4" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: From 138ae15a913bed588b02c5ed2355284ad689e1ca Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Mon, 22 Jul 2024 11:28:08 -0700 Subject: [PATCH 288/412] vm-image: Expose new LFC working set size metrics (#8298) In general, replace: * 'lfc_approximate_working_set_size' with * 'lfc_approximate_working_set_size_windows' For the "main" metrics that are actually scraped and used internally, the old one is just marked as deprecated. For the "autoscaling" metrics, we're not currently using the old one, so we can get away with just replacing it. Also, for the user-visible metrics we'll only store & expose a few different time windows, to avoid making the UI overly busy or bloating our internal metrics storage. But for the autoscaling-related scraper, we aren't storing the metrics, and it's useful to be able to programmatically operate on the trendline of how WSS increases (or doesn't!) with window size. So there, we can just output datapoints for each minute. Part of neondatabase/autoscaling#872 See also https://www.notion.so/neondatabase/cca38138fadd45eaa753d81b859490c6 --- vm-image-spec.yaml | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 3c446ecdea..224e9847f3 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -236,6 +236,7 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + # DEPRECATED - metric_name: lfc_approximate_working_set_size type: gauge help: 'Approximate working set size in pages of 8192 bytes' @@ -244,6 +245,20 @@ files: query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - metric_name: lfc_approximate_working_set_size_windows + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: [duration] + values: [size] + # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection + # of durations in a pretty-printed form. + query: | + select + x as duration, + neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size + from + (values ('5m'),('15m'),('1h')) as t (x); + - metric_name: current_lsn type: gauge help: 'Current LSN of the database' @@ -377,13 +392,19 @@ files: query: | select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - - metric_name: lfc_approximate_working_set_size + - metric_name: lfc_approximate_working_set_size_windows type: gauge help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] + key_labels: [duration_seconds] + values: [size] + # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set + # size looking back 1..60 minutes, labeled with the number of minutes. 
query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; + select + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) as size + from + (select generate_series * 60 as x from generate_series(1, 60)); build: | # Build cgroup-tools # From 8bf597c4d75ad26c9f3a81a2427ff0f303987152 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 25 Jul 2024 21:21:58 +0100 Subject: [PATCH 289/412] Update pgrx to v 0.11.3 (#8515) update pg_jsonschema extension to v 0.3.1 update pg_graphql extension to v1.5.7 update pgx_ulid extension to v0.1.5 update pg_tiktoken extension, patch Cargo.toml to use new pgrx --- Dockerfile.compute-node | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 48a52bfc6d..5e53a55316 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -657,7 +657,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.10.2 cargo-pgrx && \ + cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -672,10 +672,15 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ + echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 + # `unsafe-postgres` feature allows to build pgx extensions + # against postgres forks that decided to change their ABI name (like us). + # With that we can build extensions without forking them and using stock + # pgx. As this feature is new few manual version bumps were required. + sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -689,10 +694,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar. FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ + echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -712,6 +717,9 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + # TODO update pgrx version in the pg_tiktoken repo and remove this line + sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -725,14 +733,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ - echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ - wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - echo "********************************************************************************************************" && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control From 3a0ee16ed5f03b2a4597c71cb3f5ecf9ecddc25a Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 26 Jul 2024 07:08:13 -0700 Subject: [PATCH 290/412] Fix sql-exporter-autoscaling for pg < 16 (#8523) The lfc_approximate_working_set_size_windows query was failing on pg14 and pg15 with pq: subquery in FROM must have an alias Because aliases in that position became optional only in pg16. 
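To make the failure concrete, here is a reduced, illustrative version of the affected query shape (the real query lives in `vm-image-spec.yaml`, and the actual one-line fix is in the diff below):

```sql
-- Fails on PostgreSQL 14/15 with "subquery in FROM must have an alias":
select x from (select generate_series * 60 as x from generate_series(1, 60));

-- Works on 14/15 as well as 16+, because the FROM subquery is aliased as "t (x)":
select x from (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
```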
Some context here: https://neondb.slack.com/archives/C04DGM6SMTM/p1721970322601679?thread_ts=1721921122.528849 --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 224e9847f3..2767710bad 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -404,7 +404,7 @@ files: x::text as duration_seconds, neon.approximate_working_set_size_seconds(x) as size from - (select generate_series * 60 as x from generate_series(1, 60)); + (select generate_series * 60 as x from generate_series(1, 60)) as t (x); build: | # Build cgroup-tools # From e0a5bb17edfae771426e471ab1d604630b232590 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 29 Jul 2024 16:16:32 +0200 Subject: [PATCH 291/412] pageserver: fail if `id` is present in pageserver.toml (#8489) Overall plan: https://www.notion.so/neondatabase/Rollout-Plan-simplified-pageserver-initialization-f935ae02b225444e8a41130b7d34e4ea?pvs=4 --- `identity.toml` is the authoritative place for `id` as of https://github.com/neondatabase/neon/pull/7766 refs https://github.com/neondatabase/neon/issues/7736 --- control_plane/src/local_env.rs | 19 +++++++++++++++---- control_plane/src/pageserver.rs | 11 +++++++---- pageserver/src/config.rs | 32 ++++++++------------------------ 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d7830a5e70..505d157efd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -514,7 +514,6 @@ impl LocalEnv { #[derive(serde::Serialize, serde::Deserialize)] // (allow unknown fields, unlike PageServerConf) struct PageserverConfigTomlSubset { - id: NodeId, listen_pg_addr: String, listen_http_addr: String, pg_auth_type: AuthType, @@ -526,18 +525,30 @@ impl LocalEnv { .with_context(|| format!("read {:?}", config_toml_path))?, ) .context("parse pageserver.toml")?; + let identity_toml_path = dentry.path().join("identity.toml"); + #[derive(serde::Serialize, serde::Deserialize)] + struct IdentityTomlSubset { + id: NodeId, + } + let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&identity_toml_path) + .with_context(|| format!("read {:?}", identity_toml_path))?, + ) + .context("parse identity.toml")?; let PageserverConfigTomlSubset { - id: config_toml_id, listen_pg_addr, listen_http_addr, pg_auth_type, http_auth_type, } = config_toml; + let IdentityTomlSubset { + id: identity_toml_id, + } = identity_toml; let conf = PageServerConf { id: { anyhow::ensure!( - config_toml_id == id, - "id mismatch: config_toml.id={config_toml_id} id={id}", + identity_toml_id == id, + "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}", ); id }, diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ba4f98d945..399b1c2653 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -127,10 +127,13 @@ impl PageServerNode { } // Apply the user-provided overrides - overrides.push( - toml_edit::ser::to_string_pretty(&conf) - .expect("we deserialized this from toml earlier"), - ); + overrides.push({ + let mut doc = + toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier"); + // `id` is written out to `identity.toml` instead of `pageserver.toml` + doc.remove("id").expect("it's part of the struct"); + doc.to_string() + }); // Turn `overrides` into a toml document. 
// TODO: above code is legacy code, it should be refactored to use toml_edit directly. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 100c6c1ac5..f71881683d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -356,8 +356,6 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, - broker_endpoint: BuilderValue, broker_keepalive_interval: BuilderValue, @@ -406,11 +404,8 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { - fn new(node_id: NodeId) -> Self { - let mut this = Self::default(); - this.id(node_id); - - this + fn new() -> Self { + Self::default() } #[inline(always)] @@ -438,7 +433,6 @@ impl PageServerConfigBuilder { pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), - id: NotSet, broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), @@ -568,10 +562,6 @@ impl PageServerConfigBuilder { self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) - } - pub fn log_format(&mut self, log_format: LogFormat) { self.log_format = BuilderValue::Set(log_format) } @@ -683,7 +673,7 @@ impl PageServerConfigBuilder { self.l0_flush = BuilderValue::Set(value); } - pub fn build(self) -> anyhow::Result { + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); macro_rules! conf { @@ -716,7 +706,6 @@ impl PageServerConfigBuilder { pg_auth_type, auth_validation_public_key_path, remote_storage_config, - id, broker_endpoint, broker_keepalive_interval, log_format, @@ -744,6 +733,7 @@ impl PageServerConfigBuilder { } CUSTOM LOGIC { + id: id, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), concurrent_tenant_warmup: ConfigurableSemaphore::new({ @@ -893,7 +883,7 @@ impl PageServerConf { toml: &Document, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(node_id); + let mut builder = PageServerConfigBuilder::new(); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -924,8 +914,6 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth - // Logging is not set up yet, so we can't do it. 
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1018,7 +1006,7 @@ impl PageServerConf { } } - let mut conf = builder.build().context("invalid config")?; + let mut conf = builder.build(node_id).context("invalid config")?; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1255,7 +1243,6 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' -id = 10 metric_collection_interval = '222 s' metric_collection_endpoint = 'http://localhost:80/metrics' @@ -1272,9 +1259,8 @@ background_task_maximum_delay = '334 s' let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors - let config_string = format!( - "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", - ); + let config_string = + format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) @@ -1579,7 +1565,6 @@ broker_endpoint = '{broker_endpoint}' r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [disk_usage_based_eviction] max_usage_pct = 80 @@ -1649,7 +1634,6 @@ threshold = "20m" r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [tenant_config] evictions_low_residence_duration_metric_threshold = "20m" From a4d3e0c747ebc82ce7b5a706883de5976d8e8c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 29 Jul 2024 12:05:18 +0200 Subject: [PATCH 292/412] Adopt list_streaming in tenant deletion (#8504) Uses the Stream based `list_streaming` function added by #8457 in tenant deletion, as suggested in https://github.com/neondatabase/neon/pull/7932#issuecomment-2150480180 . We don't have to worry about retries, as the function is wrapped inside an outer retry block. If there is a retryable error either during the listing or during deletion, we just do a fresh start. Also adds `+ Send` bounds as they are required by the `delete_tenant_remote` function. 
--- libs/remote_storage/src/lib.rs | 6 +-- libs/remote_storage/src/simulate_failures.rs | 2 +- pageserver/src/tenant/mgr.rs | 52 ++++++++++---------- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 75aa28233b..031548bbec 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -194,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static { mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> impl Stream>; + ) -> impl Stream> + Send; async fn list( &self, @@ -351,10 +351,10 @@ impl GenericRemoteStorage> { mode: ListingMode, max_keys: Option, cancel: &'a CancellationToken, - ) -> impl Stream> + 'a { + ) -> impl Stream> + 'a + Send { match self { Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) - as Pin>>>, + as Pin> + Send>>, Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 67e5be2955..13f873dcdb 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper { mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> impl Stream> { + ) -> impl Stream> + Send { async_stream::stream! { self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 75c8682c97..5e1f69f4c1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1384,34 +1384,32 @@ impl TenantManager { tenant_shard_id: TenantShardId, ) -> Result<(), DeleteTenantError> { let remote_path = remote_tenant_path(&tenant_shard_id); - let keys = match self - .resources - .remote_storage - .list( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ) - .await - { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; + let mut keys_stream = self.resources.remote_storage.list_streaming( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ); + while let Some(chunk) = keys_stream.next().await { + let keys = match chunk { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(()), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - let keys = keys.into_iter().map(|o| o.key).collect::>(); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); + self.resources + .remote_storage + 
.delete_objects(&keys, &self.cancel) + .await?; + } } Ok(()) From d8205248e215c170fad1a3141330cbf5563fcf55 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:35:12 +0200 Subject: [PATCH 293/412] Add a test for clickhouse as a logical replication consumer (#8408) ## Problem We need to test logical replication with 3rd-party tools regularly. ## Summary of changes Added a test using ClickHouse as a client Co-authored-by: Alexander Bayandin --- .github/workflows/pg-clients.yml | 72 ++++++++++ poetry.lock | 153 +++++++++++++++++++++- pyproject.toml | 1 + test_runner/logical_repl/test_log_repl.py | 88 +++++++++++++ 4 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 test_runner/logical_repl/test_log_repl.py diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index e21e45c929..55b68ccdb5 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -13,6 +13,7 @@ on: paths: - '.github/workflows/pg-clients.yml' - 'test_runner/pg_clients/**' + - 'test_runner/logical_repl/**' - 'poetry.lock' workflow_dispatch: @@ -49,6 +50,77 @@ jobs: image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit + test-logical-replication: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + services: + clickhouse: + image: clickhouse/clickhouse-server:24.6.3.64 + ports: + - 9000:9000 + - 8123:8123 + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: logical_repl + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + test-postgres-client-libs: needs: [ build-build-tools-image ] runs-on: ubuntu-22.04 diff --git a/poetry.lock b/poetry.lock index 5192a574cc..d7a3dde65b 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -870,6 +870,96 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "clickhouse-connect" +version = "0.7.17" +description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" +optional = false +python-versions = "~=3.8" +files = [ + {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", 
hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"}, + {file = 
"clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"}, +] + +[package.dependencies] +certifi = "*" +lz4 = "*" +pytz = "*" +urllib3 = ">=1.26" +zstandard = "*" + +[package.extras] +arrow = ["pyarrow"] +numpy = ["numpy"] +orjson = ["orjson"] +pandas = ["pandas"] +sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"] +tzlocal = ["tzlocal (>=4.0)"] + [[package]] name = "colorama" version = "0.4.5" @@ -1470,6 +1560,56 @@ files = [ {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, ] +[[package]] +name = "lz4" +version = "4.3.3" +description = "LZ4 Bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, + {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"}, + {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"}, + {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"}, + {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"}, + {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = 
"sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"}, + {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"}, + {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"}, + {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"}, + {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"}, + {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"}, + {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"}, + {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"}, +] + +[package.extras] +docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"] +flake8 = ["flake8"] +tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"] + [[package]] name = "markupsafe" 
version = "2.1.1" @@ -2361,6 +2501,17 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "pywin32" version = "301" @@ -3206,4 +3357,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" +content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00" diff --git a/pyproject.toml b/pyproject.toml index c7f1a07512..0d5782ac7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" +clickhouse-connect = "^0.7.16" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/test_runner/logical_repl/test_log_repl.py b/test_runner/logical_repl/test_log_repl.py new file mode 100644 index 0000000000..0a1aecfe2b --- /dev/null +++ b/test_runner/logical_repl/test_log_repl.py @@ -0,0 +1,88 @@ +""" +Test the logical replication in Neon with the different consumers +""" + +import hashlib +import time + +import clickhouse_connect +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +def query_clickhouse( + client, + query: str, + digest: str, +) -> None: + """ + Run the query on the client + return answer if successful, raise an exception otherwise + """ + log.debug("Query: %s", query) + res = client.query(query) + log.debug(res.result_rows) + m = hashlib.sha1() + m.update(repr(tuple(res.result_rows)).encode()) + hash_res = m.hexdigest() + log.debug("Hash: %s", hash_res) + if hash_res == digest: + return + raise ValueError("Hash mismatch") + + +@pytest.mark.remote_cluster +def test_clickhouse(remote_pg: RemotePostgres): + """ + Test the logical replication having ClickHouse as a client + """ + conn_options = remote_pg.conn_options() + for _ in range(5): + try: + conn = psycopg2.connect(remote_pg.connstr()) + except psycopg2.OperationalError as perr: + log.debug(perr) + time.sleep(1) + else: + break + raise TimeoutError + cur = conn.cursor() + cur.execute("DROP TABLE IF EXISTS table1") + cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") + cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") + conn.commit() + client = clickhouse_connect.get_client(host="clickhouse") + client.command("SET allow_experimental_database_materialized_postgresql=1") + client.command( + "CREATE DATABASE db1_postgres ENGINE = " + f"MaterializedPostgreSQL('{conn_options['host']}', " + f"'{conn_options['dbname']}', " + f"'{conn_options['user']}', '{conn_options['password']}') " + "SETTINGS materialized_postgresql_tables_list = 'table1';" + ) + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", + ), + ) + cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") + conn.commit() + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * 
from db1_postgres.table1 order by 1", + "9eba2daaf7e4d7d27ac849525f68b562ab53947d", + ), + ) + log.debug("Sleeping before final checking if Neon is still alive") + time.sleep(3) + cur.execute("SELECT 1") From 7b3f94c1f05178cc01141b5a59d5aec7dd26d9b9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 29 Jul 2024 15:41:06 +0300 Subject: [PATCH 294/412] test: deflake test_duplicate_creation (#8536) By including comparison of `remote_consistent_lsn_visible` we risk flakyness coming from outside of timeline creation. Mask out the `remote_consistent_lsn_visible` for the comparison. Evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8489/10142336315/index.html#suites/ffbb7f9930a77115316b58ff32b7c719/89ff0270bf58577a --- test_runner/regress/test_branching.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 03d6946c15..190b624a54 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -389,6 +389,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): repeat_result = ps_http.timeline_create( env.pg_version, env.initial_tenant, success_timeline, timeout=60 ) + # remote_consistent_lsn_visible will be published only after we've + # confirmed the generation, which is not part of what we await during + # timeline creation (uploads). mask it out here to avoid flakyness. + del success_result["remote_consistent_lsn_visible"] + del repeat_result["remote_consistent_lsn_visible"] assert repeat_result == success_result finally: env.pageserver.stop(immediate=True) From 7c40266c82435e1a6e7b8f256506be3107349413 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Jul 2024 15:05:30 +0100 Subject: [PATCH 295/412] pageserver: fix return code from secondary_download_handler (#8508) ## Problem The secondary download HTTP API is meant to return 200 if the download is complete, and 202 if it is still in progress. In #8198 the download implementation was changed to drop out with success early if it over-runs a time budget, which resulted in 200 responses for incomplete downloads. This breaks storcon_cli's "tenant-warmup" command, which uses the OK status to indicate download complete. ## Summary of changes - Only return 200 if we get an Ok() _and_ the progress stats indicate the download is complete. --- pageserver/src/http/routes.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7935aeb5e9..9222123ad3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2129,14 +2129,24 @@ async fn secondary_download_handler( let timeout = wait.unwrap_or(Duration::MAX); - let status = match tokio::time::timeout( + let result = tokio::time::timeout( timeout, state.secondary_controller.download_tenant(tenant_shard_id), ) - .await - { - // Download job ran to completion. - Ok(Ok(())) => StatusCode::OK, + .await; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + let status = match result { + Ok(Ok(())) => { + if progress.layers_downloaded >= progress.layers_total { + // Download job ran to completion + StatusCode::OK + } else { + // Download dropped out without errors because it ran out of time budget + StatusCode::ACCEPTED + } + } // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. 
We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. @@ -2146,8 +2156,6 @@ async fn secondary_download_handler( Err(_) => StatusCode::ACCEPTED, }; - let progress = secondary_tenant.progress.lock().unwrap().clone(); - json_response(status, progress) } From 010203a49e4eb06733ac11241ece9f8ea6d6b8e4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 29 Jul 2024 16:49:22 +0200 Subject: [PATCH 296/412] l0_flush: use mode=direct by default => coverage in automated tests (#8534) Testing in staging and pre-prod has been [going well](https://github.com/neondatabase/neon/issues/7418#issuecomment-2255474917). This PR enables mode=direct by default, thereby providing additional coverage in the automated tests: - Rust tests - Integration tests - Nightly pagebench (likely irrelevant because it's read-only) Production deployments continue to use `mode=page-cache` for the time being: https://github.com/neondatabase/aws/pull/1655 refs https://github.com/neondatabase/neon/issues/7418 --- pageserver/src/l0_flush.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 7fe8fedc63..8945e5accd 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -2,13 +2,23 @@ use std::{num::NonZeroUsize, sync::Arc}; use crate::tenant::ephemeral_file; -#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { - #[default] PageCached, #[serde(rename_all = "snake_case")] - Direct { max_concurrency: NonZeroUsize }, + Direct { + max_concurrency: NonZeroUsize, + }, +} + +impl Default for L0FlushConfig { + fn default() -> Self { + Self::Direct { + // TODO: using num_cpus results in different peak memory usage on different instance types. + max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(), + } + } } #[derive(Clone)] From 3f05758d0937c28b055514ddb18532426225451f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Jul 2024 17:50:44 +0100 Subject: [PATCH 297/412] scrubber: enable cleaning up garbage tenants from known deletion bugs, add object age safety check (#8461) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Old storage buckets can contain a lot of tenants that aren't known to the control plane at all, because they belonged to test jobs that get their control plane state cleaned up shortly after running. In general, it's somewhat unsafe to purge these, as it's hard to distinguish "control plane doesn't know about this, so it's garbage" from "control plane said it didn't know about this, which is a bug in the scrubber, control plane, or API URL configured". 
However, the most common case is that we see only a small husk of a tenant in S3 from a specific old behavior of the software, for example: - We had a bug where heatmaps weren't deleted on tenant delete - When WAL DR was first deployed, we didn't delete initdb.tar.zst on tenant deletion ## Summary of changes - Add a KnownBug variant for the garbage reason - Include such cases in the "safe" deletion mode (`--mode=deleted`) - Add code that inspects tenants missing in control plane to identify cases of known bugs (this is kind of slow, but should go away once we've cleaned all these up) - Add an additional `-min-age` safety check similar to physical GC, where even if everything indicates objects aren't needed, we won't delete something that has been modified too recently. --------- Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Co-authored-by: Arpad Müller --- storage_scrubber/src/garbage.rs | 118 ++++++++++++++++++++++++++++++-- storage_scrubber/src/main.rs | 10 ++- 2 files changed, 121 insertions(+), 7 deletions(-) diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 333269ec7e..78ecfc7232 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -5,6 +5,7 @@ use std::{ collections::{HashMap, HashSet}, sync::Arc, + time::Duration, }; use anyhow::Context; @@ -18,7 +19,7 @@ use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, init_remote_generic, + init_remote, init_remote_generic, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; @@ -27,6 +28,11 @@ use crate::{ enum GarbageReason { DeletedInConsole, MissingInConsole, + + // The remaining data relates to a known deletion issue, and we're sure that purging this + // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where + // there is nothing in a tenant path apart from a heatmap file. + KnownBug, } #[derive(Serialize, Deserialize, Debug)] @@ -72,6 +78,15 @@ impl GarbageList { } } + /// If an entity has been identified as requiring purge due to a known bug, e.g. + /// a particular type of object left behind after an incomplete deletion. + fn append_buggy(&mut self, entity: GarbageEntity) { + self.items.push(GarbageItem { + entity, + reason: GarbageReason::KnownBug, + }); + } + /// Return true if appended, false if not. False means the result was not garbage. fn maybe_append(&mut self, entity: GarbageEntity, result: Option) -> bool where @@ -219,6 +234,71 @@ async fn find_garbage_inner( assert!(project.tenant == tenant_shard_id.tenant_id); } + // Special case: If it's missing in console, check for known bugs that would enable us to conclusively + // identify it as purge-able anyway + if console_result.is_none() { + let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id) + .await? 
+ .collect::>() + .await; + if timelines.is_empty() { + // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps + let tenant_objects = list_objects_with_retries( + &s3_client, + &target.tenant_root(&tenant_shard_id), + None, + ) + .await?; + let object = tenant_objects.contents.as_ref().unwrap().first().unwrap(); + if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap()); + } + } else { + // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial + // rollout of WAL DR in which we never deleted these. + let mut any_non_initdb = false; + + for timeline_r in timelines { + let timeline = timeline_r?; + let timeline_objects = list_objects_with_retries( + &s3_client, + &target.timeline_root(&timeline), + None, + ) + .await?; + if timeline_objects + .common_prefixes + .as_ref() + .map(|v| v.len()) + .unwrap_or(0) + > 0 + { + // Sub-paths? Unexpected + any_non_initdb = true; + } else { + let object = timeline_objects.contents.as_ref().unwrap().first().unwrap(); + if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") { + tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); + } else { + any_non_initdb = true; + } + } + } + + if any_non_initdb { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + } else { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } + } + } + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { @@ -349,9 +429,6 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - // TODO: apply extra validation based on object modification time. Don't purge - // timelines whose index_part.json has been touched recently. - let list = s3_client .list( Some(&timeline_root), @@ -422,6 +499,7 @@ impl DeletionProgressTracker { pub async fn purge_garbage( input_path: String, mode: PurgeMode, + min_age: Duration, dry_run: bool, ) -> anyhow::Result<()> { let list_bytes = tokio::fs::read(&input_path).await?; @@ -459,6 +537,7 @@ pub async fn purge_garbage( .filter(|i| match (&mode, &i.reason) { (PurgeMode::DeletedAndMissing, _) => true, (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true, + (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true, (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false, }); @@ -487,6 +566,37 @@ pub async fn purge_garbage( let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; + + // Extra safety check: even if a collection of objects is garbage, check max() of modification + // times before purging, so that if we incorrectly marked a live tenant as garbage then we would + // notice that its index has been written recently and would omit deleting it. 
+ if object_list.is_empty() { + // Simplify subsequent code by ensuring list always has at least one item + // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes + continue; + } + let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap(); + let age = max_mtime.elapsed(); + match age { + Err(_) => { + tracing::warn!("Bad last_modified time"); + continue; + } + Ok(a) if a < min_age => { + // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently + // written, but out of an abundance of caution we still don't purge it. + tracing::info!( + "Skipping tenant with young objects {}..{}", + object_list.first().as_ref().unwrap().key, + object_list.last().as_ref().unwrap().key + ); + continue; + } + Ok(_) => { + // Passed age check + } + } + objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index b3ed6f6451..346829b7c9 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -50,6 +50,8 @@ enum Command { input_path: String, #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, + #[arg(long = "min-age")] + min_age: humantime::Duration, }, #[command(verbatim_doc_comment)] ScanMetadata { @@ -196,9 +198,11 @@ async fn main() -> anyhow::Result<()> { let console_config = ConsoleConfig::from_env()?; find_garbage(bucket_config, console_config, depth, node_kind, output_path).await } - Command::PurgeGarbage { input_path, mode } => { - purge_garbage(input_path, mode, !cli.delete).await - } + Command::PurgeGarbage { + input_path, + mode, + min_age, + } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await, Command::TenantSnapshot { tenant_id, output_path, From 57f22178d7bbe3f635fc5f0858b942d73f03ad72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Jul 2024 09:59:15 +0200 Subject: [PATCH 298/412] Add metrics for input data considered and taken for compression (#8522) If compression is enabled, we currently try compressing each image larger than a specific size and if the compressed version is smaller, we write that one, otherwise we use the uncompressed image. However, this might sometimes be a wasteful process, if there is a substantial amount of images that don't compress well. The compression metrics added in #8420 `pageserver_compression_image_in_bytes_total` and `pageserver_compression_image_out_bytes_total` are well designed for answering the question how space efficient the total compression process is end-to-end, which helps one to decide whether to enable it or not. To answer the question of how much waste there is in terms of trial compression, so CPU time, we add two metrics: * one about the images that have been trial-compressed (considered), and * one about the images where the compressed image has actually been written (chosen). There is different ways of weighting them, like for example one could look at the count, or the compressed data. But the main contributor to compression CPU usage is amount of data processed, so we weight the images by their *uncompressed* size. 
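As a hypothetical illustration: if 10 GiB of eligible images were trial-compressed over some period but only 6 GiB of them (measured by their uncompressed size) ended up being written in compressed form, the new counters would advance by 10 GiB and 6 GiB respectively, i.e. roughly 40% of the trial-compression work yielded no space saving. The ratio of the "chosen" counter to the "considered" counter is therefore a rough measure of how much of the compression CPU spend actually pays off.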
In other words, the two metrics are: * `pageserver_compression_image_in_bytes_considered` * `pageserver_compression_image_in_bytes_chosen` Part of #5431 --- pageserver/src/metrics.rs | 18 +++++++++- pageserver/src/tenant/blob_io.rs | 36 +++++++++++++------ .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 26 ++++++++++++-- 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9aff5220f5..ede6b41a75 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -613,7 +613,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_in_bytes_total", - "Size of uncompressed data written into image layers" + "Size of data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_considered", + "Size of potentially compressible data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_chosen", + "Size of data whose compressed form was written into image layers" ) .expect("failed to define a metric") }); diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 791eefebe9..8e9d349ca8 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; +#[derive(Copy, Clone, Debug)] +pub struct CompressionInfo { + pub written_compressed: bool, + pub compressed_size: Option, +} + impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. pub async fn read_blob( @@ -273,8 +279,10 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) - .await + let (buf, res) = self + .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await; + (buf, res.map(|(off, _compression_info)| off)) } /// Write a blob of data. 
Returns the offset that it was written to, @@ -284,8 +292,12 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, algorithm: ImageCompressionAlgorithm, - ) -> (B::Buf, Result) { + ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) { let offset = self.offset; + let mut compression_info = CompressionInfo { + written_compressed: false, + compressed_size: None, + }; let len = srcbuf.bytes_init(); @@ -328,7 +340,9 @@ impl BlobWriter { encoder.write_all(&slice[..]).await.unwrap(); encoder.shutdown().await.unwrap(); let compressed = encoder.into_inner(); + compression_info.compressed_size = Some(compressed.len()); if compressed.len() < len { + compression_info.written_compressed = true; let compressed_len = compressed.len(); compressed_buf = Some(compressed); (BYTE_ZSTD, compressed_len, slice.into_inner()) @@ -359,7 +373,7 @@ impl BlobWriter { } else { self.write_all(srcbuf, ctx).await }; - (srcbuf, res.map(|_| offset)) + (srcbuf, res.map(|_| (offset, compression_info))) } } @@ -416,12 +430,14 @@ pub(crate) mod tests { let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = if compression { - wtr.write_blob_maybe_compressed( - blob.clone(), - ctx, - ImageCompressionAlgorithm::Zstd { level: Some(1) }, - ) - .await + let res = wtr + .write_blob_maybe_compressed( + blob.clone(), + ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await; + (res.0, res.1.map(|(off, _)| off)) } else { wtr.write_blob(blob.clone(), ctx).await }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 229d1e3608..f9becf53ff 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -467,7 +467,7 @@ impl DeltaLayerWriterInner { .write_blob_maybe_compressed(val, ctx, compression) .await; let off = match res { - Ok(off) => off, + Ok((off, _)) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), }; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 44ba685490..08db27514a 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -734,6 +734,14 @@ struct ImageLayerWriterInner { // Total uncompressed bytes passed into put_image uncompressed_bytes: u64, + // Like `uncompressed_bytes`, + // but only of images we might consider for compression + uncompressed_bytes_eligible: u64, + + // Like `uncompressed_bytes`, but only of images + // where we have chosen their compressed form + uncompressed_bytes_chosen: u64, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -790,6 +798,8 @@ impl ImageLayerWriterInner { tree: tree_builder, blob_writer, uncompressed_bytes: 0, + uncompressed_bytes_eligible: 0, + uncompressed_bytes_chosen: 0, }; Ok(writer) @@ -808,13 +818,22 @@ impl ImageLayerWriterInner { ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let compression = self.conf.image_compression; - self.uncompressed_bytes += img.len() as u64; + let uncompressed_len = img.len() as u64; + self.uncompressed_bytes += uncompressed_len; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let off = res?; + let (off, compression_info) = res?; + if compression_info.compressed_size.is_some() { + // The image has been considered for compression at least + self.uncompressed_bytes_eligible += uncompressed_len; + } + if 
compression_info.written_compressed { + // The image has been compressed + self.uncompressed_bytes_chosen += uncompressed_len; + } let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -837,6 +856,9 @@ impl ImageLayerWriterInner { // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED + .inc_by(self.uncompressed_bytes_eligible); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); let mut file = self.blob_writer.into_inner(); From cd3f4b3a5352993488901327432ff05a0e760e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Jul 2024 11:00:37 +0200 Subject: [PATCH 299/412] scrubber: add remote_storage based listing APIs and use them in find-large-objects (#8541) Add two new functions `stream_objects_with_retries` and `stream_tenants_generic` and use them in the `find-large-objects` subcommand, migrating it to `remote_storage`. Also adds the `size` field to the `ListingObject` struct. Part of #7547 --- libs/remote_storage/src/azure_blob.rs | 3 +- libs/remote_storage/src/lib.rs | 1 + libs/remote_storage/src/local_fs.rs | 2 + libs/remote_storage/src/s3_bucket.rs | 5 +- storage_scrubber/src/find_large_objects.rs | 44 +++++------ storage_scrubber/src/garbage.rs | 2 +- storage_scrubber/src/lib.rs | 90 +++++++++++++++++----- storage_scrubber/src/metadata_stream.rs | 33 +++++++- 8 files changed, 133 insertions(+), 47 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 6ca4ae43f2..3c77d5a227 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -355,7 +355,8 @@ impl RemoteStorage for AzureBlobStorage { .blobs() .map(|k| ListingObject{ key: self.name_to_relative_path(&k.name), - last_modified: k.properties.last_modified.into() + last_modified: k.properties.last_modified.into(), + size: k.properties.content_length, } ); diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 031548bbec..794e696769 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -153,6 +153,7 @@ pub enum ListingMode { pub struct ListingObject { pub key: RemotePath, pub last_modified: SystemTime, + pub size: u64, } #[derive(Default)] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index bc6b10aa51..99b4aa4061 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -368,6 +368,7 @@ impl RemoteStorage for LocalFs { key: k.clone(), // LocalFs is just for testing, so just specify a dummy time last_modified: SystemTime::now(), + size: 0, }) } }) @@ -411,6 +412,7 @@ impl RemoteStorage for LocalFs { key: RemotePath::from_string(&relative_key).unwrap(), // LocalFs is just for testing last_modified: SystemTime::now(), + size: 0, }); } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 412f307445..1f25da813d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -565,9 +565,12 @@ impl RemoteStorage for S3Bucket { } }; + let size = object.size.unwrap_or(0) as u64; + result.keys.push(ListingObject{ key, - last_modified + last_modified, + size, }); if 
let Some(mut mk) = max_keys { assert!(mk > 0); diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 2ef802229d..f5bb7e088a 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -1,10 +1,13 @@ +use std::pin::pin; + use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::storage_layer::LayerName; +use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; use crate::{ - checks::parse_layer_object_name, init_remote, list_objects_with_retries, - metadata_stream::stream_tenants, BucketConfig, NodeKind, + checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic, + stream_objects_with_retries, BucketConfig, NodeKind, }; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -47,45 +50,38 @@ pub async fn find_large_objects( ignore_deltas: bool, concurrency: usize, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let (remote_client, target) = + init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = pin!(stream_tenants_generic(&remote_client, &target)); let objects_stream = tenants.map_ok(|tenant_shard_id| { let mut tenant_root = target.tenant_root(&tenant_shard_id); - let s3_client = s3_client.clone(); + let remote_client = remote_client.clone(); async move { let mut objects = Vec::new(); let mut total_objects_ctr = 0u64; // We want the objects and not just common prefixes tenant_root.delimiter.clear(); - let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) - .await?; - for obj in fetch_response.contents().iter().filter(|o| { - if let Some(obj_size) = o.size { - min_size as i64 <= obj_size - } else { - false - } - }) { - let key = obj.key().expect("couldn't get key").to_owned(); + let mut objects_stream = pin!(stream_objects_with_retries( + &remote_client, + ListingMode::NoDelimiter, + &tenant_root + )); + while let Some(listing) = objects_stream.next().await { + let listing = listing?; + for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) { + let key = obj.key.to_string(); let kind = LargeObjectKind::from_key(&key); if ignore_deltas && kind == LargeObjectKind::DeltaLayer { continue; } objects.push(LargeObject { key, - size: obj.size.unwrap() as u64, + size: obj.size, kind, }) } - total_objects_ctr += fetch_response.contents().len() as u64; - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } + total_objects_ctr += listing.keys.len() as u64; } Ok((tenant_shard_id, objects, total_objects_ctr)) diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 78ecfc7232..73479c3658 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -510,7 +510,7 @@ pub async fn purge_garbage( input_path ); - let remote_client = + let (remote_client, _target) = init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; assert_eq!( diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 5c64e7e459..c7900f9b02 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -22,16 +22,18 @@ use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use 
futures::{Stream, StreamExt}; use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use remote_storage::{ - GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; @@ -319,27 +321,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { } } +fn make_root_target( + bucket_name: String, + prefix_in_bucket: String, + node_kind: NodeKind, +) -> RootTarget { + let s3_target = S3Target { + bucket_name, + prefix_in_bucket, + delimiter: "/".to_string(), + }; + match node_kind { + NodeKind::Pageserver => RootTarget::Pageserver(s3_target), + NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target), + } +} + async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); - let delimiter = "/".to_string(); let s3_client = Arc::new(init_s3_client(bucket_region).await); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); - let s3_root = match node_kind { - NodeKind::Pageserver => RootTarget::Pageserver(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), - delimiter, - }), - NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), - delimiter, - }), - }; + let s3_root = make_root_target( + bucket_config.bucket, + bucket_config.prefix_in_bucket.unwrap_or(default_prefix), + node_kind, + ); Ok((s3_client, s3_root)) } @@ -347,12 +357,12 @@ async fn init_remote( async fn init_remote_generic( bucket_config: BucketConfig, node_kind: NodeKind, -) -> anyhow::Result { +) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { let endpoint = env::var("AWS_ENDPOINT_URL").ok(); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); let storage = S3Config { - bucket_name: bucket_config.bucket, + bucket_name: bucket_config.bucket.clone(), bucket_region: bucket_config.region, prefix_in_bucket, endpoint, @@ -366,7 +376,13 @@ async fn init_remote_generic( storage: RemoteStorageKind::AwsS3(storage), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - GenericRemoteStorage::from_config(&storage_config).await + + // We already pass the prefix to the remote client above + let prefix_in_root_target = String::new(); + let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + + let client = GenericRemoteStorage::from_config(&storage_config).await?; + Ok((client, s3_root)) } async fn list_objects_with_retries( @@ -404,6 +420,44 @@ async fn list_objects_with_retries( Err(anyhow!("unreachable unless MAX_RETRIES==0")) } +fn stream_objects_with_retries<'a>( + storage_client: &'a GenericRemoteStorage, + 
listing_mode: ListingMode, + s3_target: &'a S3Target, +) -> impl Stream> + 'a { + async_stream::stream! { + let mut trial = 0; + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + let mut list_stream = + storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel); + while let Some(res) = list_stream.next().await { + if let Err(err) = res { + let yield_err = if err.is_permanent() { + true + } else { + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + trial += 1; + trial == MAX_RETRIES - 1 + }; + if yield_err { + yield Err(err) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + break; + } + } else { + trial = 0; + yield res.map_err(anyhow::Error::from); + } + } + } +} + async fn download_object_with_retries( s3_client: &Client, bucket_name: &str, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c05874f556..91dba3c992 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,12 +1,41 @@ -use anyhow::Context; +use std::str::FromStr; + +use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use futures::StreamExt; +use remote_storage::{GenericRemoteStorage, ListingMode}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; +use crate::{ + list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, + TenantShardTimelineId, +}; use pageserver_api::shard::TenantShardId; use utils::id::{TenantId, TimelineId}; +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes +pub fn stream_tenants_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, +) -> impl Stream> + 'a { + try_stream! 
{ + let tenants_target = target.tenants_root(); + let mut tenants_stream = + std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); + while let Some(chunk) = tenants_stream.next().await { + let chunk = chunk?; + let entry_ids = chunk.prefixes.iter() + .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'"))); + for dir_name_res in entry_ids { + let dir_name = dir_name_res?; + let id = TenantShardId::from_str(dir_name)?; + yield id; + } + } + } +} + /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( s3_client: &'a Client, From 5be3e090827755cc052f48e703c028ef721a43b8 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 30 Jul 2024 13:38:23 +0100 Subject: [PATCH 300/412] CI(benchmarking): make neonvm default provisioner (#8538) ## Problem We don't allow regular end-users to use `k8s-pod` provisioner, but we still use it in nightly benchmarks ## Summary of changes - Remove `provisioner` input from `neon-create-project` action, use `k8s-neonvm` as a default provioner - Change `neon-` platform prefix to `neonvm-` - Remove `neon-captest-freetier` and `neon-captest-new` as we already have their `neonvm` counterparts --- .../actions/neon-project-create/action.yml | 12 +---- .github/workflows/benchmarking.yml | 52 ++++++++----------- 2 files changed, 25 insertions(+), 39 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index d4029bd37c..f4a194639f 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -14,11 +14,8 @@ inputs: api_host: description: 'Neon API host' default: console-stage.neon.build - provisioner: - description: 'k8s-pod or k8s-neonvm' - default: 'k8s-pod' compute_units: - description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units' default: '[1, 1]' outputs: @@ -37,10 +34,6 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | - if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then - echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU" - fi - project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -52,7 +45,7 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", - \"provisioner\": \"${PROVISIONER}\", + \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } @@ -75,6 +68,5 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} - PROVISIONER: ${{ inputs.provisioner }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5ffdb29fe6..f7ea534fb9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,9 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} - provisioner: 'k8s-pod' - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' - 
provisioner: 'k8s-neonvm' env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -100,7 +98,6 @@ jobs: region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - provisioner: ${{ matrix.provisioner }} - name: Run benchmark uses: ./.github/actions/run-python-test-set @@ -216,11 +213,11 @@ jobs: # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: - # - neon-captest-new: Freshly created project (1 CU) - # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-new: Freshly created project (1 CU) + # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU) # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region - # - neon-captest-reuse: Reusing existing project + # - neonvm-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: @@ -245,18 +242,16 @@ jobs: "'"$region_id_default"'" ], "platform": [ - "neon-captest-new", - "neon-captest-reuse", + "neonvm-captest-new", + "neonvm-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' @@ -271,7 +266,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ] }' @@ -287,7 +282,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ], "scale": [ "10" @@ -338,7 +333,7 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: 
./.github/actions/neon-project-create with: @@ -346,19 +341,18 @@ jobs: postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} - provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -442,9 +436,9 @@ jobs: fail-fast: false matrix: include: - - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "neonvm-captest-pgvector" - PLATFORM: "azure-captest-pgvector" - + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -486,7 +480,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-pgvector) + neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; azure-captest-pgvector) @@ -585,7 +579,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -595,7 +589,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -672,7 +666,7 @@ jobs: - name: Get Connstring Secret Name run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) ENV_PLATFORM=CAPTEST_TPCH ;; rds-aurora) @@ -682,7 +676,7 @@ jobs: ENV_PLATFORM=RDS_AURORA_TPCH ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -759,7 +753,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -769,7 +763,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac From 5702e1cb46d2e98c9cce711247b580ddca5b48c8 Mon Sep 17 00:00:00 2001 From: Anton Chaporgin Date: Tue, 30 Jul 2024 16:15:53 +0300 Subject: [PATCH 301/412] [neon/acr] impr: push to ACR while building images (#8545) This tests the ability to push into ACR using OIDC. Proved it worked by running slightly modified YAML. In `promote-images` we push the following images `neon compute-tools {vm-,}compute-node-{v14,v15,v16}` into `neoneastus2`. 
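Assuming the ACR push succeeds, the same build tag should then be pullable from the dev registry as well as Docker Hub, e.g. `docker pull neoneastus2.azurecr.io/neondatabase/neon:<build-tag>` (placeholder tag) alongside the existing `neondatabase/neon:<build-tag>` images.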
https://github.com/neondatabase/cloud/issues/14640 --- .github/workflows/build_and_test.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 872c1fbb39..3cf40e6153 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -833,6 +833,9 @@ jobs: rm -rf .docker-custom promote-images: + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 @@ -859,6 +862,28 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done + - name: Azure login + if: github.ref_name == 'main' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: github.ref_name == 'main' + run: | + az acr login --name=neoneastus2 + + - name: Copy docker images to ACR-dev + if: github.ref_name == 'main' + run: | + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create \ + -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/${image}:${{ needs.tag.outputs.build-tag }} + done + - name: Add latest tag to images if: github.ref_name == 'main' run: | From b87a1384f05418f2e994370d77b15a7906c3d999 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:32:00 -0400 Subject: [PATCH 302/412] feat(storcon): store scrubber metadata scan result (#8480) Part of #8128, followed by #8502. ## Problem Currently we lack mechanism to alert unhealthy `scan_metadata` status if we start running this scrubber command as part of a cronjob. With the storage controller client introduced to storage scrubber in #8196, it is viable to set up alert by storing health status in the storage controller database. We intentionally do not store the full output to the database as the json blobs potentially makes the table really huge. Instead, only a health status and a timestamp recording the last time metadata health status is posted on a tenant shard. 
Signed-off-by: Yuchen Liang --- Cargo.lock | 2 + libs/pageserver_api/src/controller_api.rs | 38 +++- libs/utils/src/auth.rs | 16 +- storage_controller/Cargo.toml | 9 +- .../down.sql | 1 + .../up.sql | 14 ++ storage_controller/src/http.rs | 73 ++++++- storage_controller/src/persistence.rs | 180 +++++++++++++++++- storage_controller/src/schema.rs | 12 +- storage_controller/src/service.rs | 74 ++++++- test_runner/fixtures/neon_fixtures.py | 46 +++++ .../regress/test_storage_controller.py | 122 +++++++++++- 12 files changed, 560 insertions(+), 27 deletions(-) create mode 100644 storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql create mode 100644 storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql diff --git a/Cargo.lock b/Cargo.lock index 2b56095bc8..2186d55e9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" dependencies = [ "bitflags 2.4.1", "byteorder", + "chrono", "diesel_derives", "itoa", "pq-sys", @@ -5718,6 +5719,7 @@ dependencies = [ "aws-config", "bytes", "camino", + "chrono", "clap", "control_plane", "diesel", diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 474f796040..36b1bd95ff 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,5 +1,5 @@ use std::str::FromStr; -use std::time::Instant; +use std::time::{Duration, Instant}; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server @@ -294,6 +294,42 @@ pub enum PlacementPolicy { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} +/// Metadata health record posted from scrubber. +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthRecord { + pub tenant_shard_id: TenantShardId, + pub healthy: bool, + pub last_scrubbed_at: chrono::DateTime, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateRequest { + pub healthy_tenant_shards: Vec, + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateResponse {} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListUnhealthyResponse { + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListOutdatedRequest { + #[serde(with = "humantime_serde")] + pub not_scrubbed_for: Duration, +} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListOutdatedResponse { + pub health_records: Vec, +} + #[cfg(test)] mod test { use super::*; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index a1170a460d..7b735875b7 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { - // Provides access to all data for a specific tenant (specified in `struct Claims` below) + /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, - // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. - // Should only be used e.g. for status check/tenant creation/list. + /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. + /// Should only be used e.g. 
for status check/tenant creation/list. PageServerApi, - // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. - // Should only be used e.g. for status check. - // Currently also used for connection from any pageserver to any safekeeper. + /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. + /// Should only be used e.g. for status check. + /// Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, - // The scope used by pageservers in upcalls to storage controller and cloud control plane + /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, - // Allows access to control plane managment API and some storage controller endpoints. + /// Allows access to control plane managment API and some storage controller endpoints. Admin, /// Allows access to storage controller APIs used by the scrubber, to interrogate the state diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b54dea5d47..d14b235046 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -18,6 +18,7 @@ anyhow.workspace = true aws-config.workspace = true bytes.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true fail.workspace = true futures.workspace = true @@ -44,7 +45,12 @@ scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } +diesel = { version = "2.1.4", features = [ + "serde_json", + "postgres", + "r2d2", + "chrono", +] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } @@ -52,4 +58,3 @@ utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } workspace_hack = { version = "0.1", path = "../workspace_hack" } - diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql new file mode 100644 index 0000000000..1ecfc8786f --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql @@ -0,0 +1 @@ +DROP TABLE metadata_health; \ No newline at end of file diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql new file mode 100644 index 0000000000..fa87eda119 --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql @@ -0,0 +1,14 @@ +CREATE TABLE metadata_health ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + -- Rely on cascade behavior for delete + FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE, + healthy BOOLEAN NOT NULL DEFAULT TRUE, + last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + + +INSERT INTO metadata_health(tenant_id, shard_number, shard_count) +SELECT tenant_id, shard_number, shard_count FROM tenant_shards; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c77918827f..e8513b31eb 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -10,7 +10,11 @@ use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, 
Uri}; use metrics::{BuildInfo, NeonMetrics}; -use pageserver_api::controller_api::TenantCreateRequest; +use pageserver_api::controller_api::{ + MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, + MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, + TenantCreateRequest, +}; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } +async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Scrubber)?; + + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + state.service.metadata_health_update(update_req).await?; + + json_response(StatusCode::OK, MetadataHealthUpdateResponse {}) +} + +async fn handle_metadata_health_list_unhealthy( + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; + + json_response( + StatusCode::OK, + MetadataHealthListUnhealthyResponse { + unhealthy_tenant_shards, + }, + ) +} + +async fn handle_metadata_health_list_outdated( + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let list_outdated_req = json_request::(&mut req).await?; + let state = get_state(&req); + let health_records = state + .service + .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for) + .await?; + + json_response( + StatusCode::OK, + MetadataHealthListOutdatedResponse { health_records }, + ) +} + async fn handle_tenant_shard_split( service: Arc, mut req: Request, @@ -987,6 +1036,28 @@ pub fn make_router( RequestName("control_v1_cancel_node_fill"), ) }) + // Metadata health operations + .post("/control/v1/metadata_health/update", |r| { + named_request_span( + r, + handle_metadata_health_update, + RequestName("control_v1_metadata_health_update"), + ) + }) + .get("/control/v1/metadata_health/unhealthy", |r| { + named_request_span( + r, + handle_metadata_health_list_unhealthy, + RequestName("control_v1_metadata_health_list_unhealthy"), + ) + }) + .post("/control/v1/metadata_health/outdated", |r| { + named_request_span( + r, + handle_metadata_health_list_outdated, + RequestName("control_v1_metadata_health_list_outdated"), + ) + }) // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d8f31e86e5..64a3e597ce 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -8,6 +8,7 @@ use self::split_state::SplitState; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation { UpdateTenantShard, DeleteTenant, UpdateTenantConfig, + UpdateMetadataHealth, + ListMetadataHealth, + ListMetadataHealthUnhealthy, + ListMetadataHealthOutdated, } #[must_use] @@ -307,15 +312,32 @@ impl Persistence { &self, shards: Vec, ) -> 
DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; + use crate::schema::metadata_health; + use crate::schema::tenant_shards; + + let now = chrono::Utc::now(); + + let metadata_health_records = shards + .iter() + .map(|t| MetadataHealthPersistence { + tenant_id: t.tenant_id.clone(), + shard_number: t.shard_number, + shard_count: t.shard_count, + healthy: true, + last_scrubbed_at: now, + }) + .collect::>(); + self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + diesel::insert_into(tenant_shards::table) + .values(&shards) + .execute(conn)?; + + diesel::insert_into(metadata_health::table) + .values(&metadata_health_records) + .execute(conn)?; Ok(()) }, ) @@ -329,10 +351,10 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::DeleteTenant, move |conn| -> DatabaseResult<()> { + // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) .execute(conn)?; - Ok(()) }, ) @@ -675,6 +697,94 @@ impl Persistence { ) .await } + + /// Stores all the latest metadata health updates durably. Updates existing entry on conflict. + /// + /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller. + #[allow(dead_code)] + pub(crate) async fn update_metadata_health_records( + &self, + healthy_records: Vec, + unhealthy_records: Vec, + now: chrono::DateTime, + ) -> DatabaseResult<()> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { + diesel::insert_into(metadata_health) + .values(&healthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(true), last_scrubbed_at.eq(now))) + .execute(conn)?; + + diesel::insert_into(metadata_health) + .values(&unhealthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(false), last_scrubbed_at.eq(now))) + .execute(conn)?; + Ok(()) + }, + ) + .await + } + + /// Lists all the metadata health records. + #[allow(dead_code)] + pub(crate) async fn list_metadata_health_records( + &self, + ) -> DatabaseResult> { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) + .await + } + + /// Lists all the metadata health records that is unhealthy. + #[allow(dead_code)] + pub(crate) async fn list_unhealthy_metadata_health_records( + &self, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthUnhealthy, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) + }, + ) + .await + } + + /// Lists all the metadata health records that have not been updated since an `earlier` time. 
+ #[allow(dead_code)] + pub(crate) async fn list_outdated_metadata_health_records( + &self, + earlier: chrono::DateTime, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { + let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); + let res = query.load::(conn)?; + + Ok(res) + }, + ) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -744,3 +854,59 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, } + +/// Tenant metadata health status that are stored durably. +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::metadata_health)] +pub(crate) struct MetadataHealthPersistence { + #[serde(default)] + pub(crate) tenant_id: String, + #[serde(default)] + pub(crate) shard_number: i32, + #[serde(default)] + pub(crate) shard_count: i32, + + pub(crate) healthy: bool, + pub(crate) last_scrubbed_at: chrono::DateTime, +} + +impl MetadataHealthPersistence { + pub fn new( + tenant_shard_id: TenantShardId, + healthy: bool, + last_scrubbed_at: chrono::DateTime, + ) -> Self { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_number = tenant_shard_id.shard_number.0 as i32; + let shard_count = tenant_shard_id.shard_count.literal() as i32; + + MetadataHealthPersistence { + tenant_id, + shard_number, + shard_count, + healthy, + last_scrubbed_at, + } + } + + #[allow(dead_code)] + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + +impl From for MetadataHealthRecord { + fn from(value: MetadataHealthPersistence) -> Self { + MetadataHealthRecord { + tenant_shard_id: value + .get_tenant_shard_id() + .expect("stored tenant id should be valid"), + healthy: value.healthy, + last_scrubbed_at: value.last_scrubbed_at, + } + } +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index ff37d0fe77..cb5ba3f38b 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,15 @@ // @generated automatically by Diesel CLI. +diesel::table! { + metadata_health (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + healthy -> Bool, + last_scrubbed_at -> Timestamptz, + } +} + diesel::table! { nodes (node_id) { node_id -> Int8, @@ -26,4 +36,4 @@ diesel::table! 
{ } } -diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 821f45d0c0..ea515f67da 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -16,7 +16,7 @@ use crate::{ compute_hook::NotifyError, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, metrics::LeadershipStatusGroup, - persistence::{AbortShardSplitStatus, TenantFilter}, + persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ @@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, UtilizationScore, + MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -6095,6 +6095,68 @@ impl Service { Ok(()) } + /// Updates scrubber metadata health check results. + pub(crate) async fn metadata_health_update( + &self, + update_req: MetadataHealthUpdateRequest, + ) -> Result<(), ApiError> { + let now = chrono::offset::Utc::now(); + let (healthy_records, unhealthy_records) = { + let locked = self.inner.read().unwrap(); + let healthy_records = update_req + .healthy_tenant_shards + .into_iter() + // Retain only health records associated with tenant shards managed by storage controller. + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now)) + .collect(); + let unhealthy_records = update_req + .unhealthy_tenant_shards + .into_iter() + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now)) + .collect(); + + (healthy_records, unhealthy_records) + }; + + self.persistence + .update_metadata_health_records(healthy_records, unhealthy_records, now) + .await?; + Ok(()) + } + + /// Lists the tenant shards that has unhealthy metadata status. + pub(crate) async fn metadata_health_list_unhealthy( + &self, + ) -> Result, ApiError> { + let result = self + .persistence + .list_unhealthy_metadata_health_records() + .await? + .iter() + .map(|p| p.get_tenant_shard_id().unwrap()) + .collect(); + + Ok(result) + } + + /// Lists the tenant shards that have not been scrubbed for some duration. 
+ pub(crate) async fn metadata_health_list_outdated( + &self, + not_scrubbed_for: Duration, + ) -> Result, ApiError> { + let earlier = chrono::offset::Utc::now() - not_scrubbed_for; + let result = self + .persistence + .list_outdated_metadata_health_records(earlier) + .await? + .into_iter() + .map(|record| record.into()) + .collect(); + Ok(result) + } + pub(crate) fn get_leadership_status(&self) -> LeadershipStatus { self.inner.read().unwrap().get_leadership_status() } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c5fffc2af6..5b2ebea794 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -449,6 +449,7 @@ class TokenScope(str, Enum): GENERATIONS_API = "generations_api" SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" + SCRUBBER = "scrubber" class NeonEnvBuilder: @@ -2586,6 +2587,51 @@ class NeonStorageController(MetricsGetter, LogUtils): time.sleep(backoff) + def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]): + body: Dict[str, Any] = { + "healthy_tenant_shards": [str(t) for t in healthy], + "unhealthy_tenant_shards": [str(t) for t in unhealthy], + } + + self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/metadata_health/update", + json=body, + headers=self.headers(TokenScope.SCRUBBER), + ) + + def metadata_health_list_unhealthy(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_list_outdated(self, duration: str): + body: Dict[str, Any] = {"not_scrubbed_for": duration} + + response = self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool: + """Metadata is healthy if there is no unhealthy or outdated health records.""" + + unhealthy = self.metadata_health_list_unhealthy() + outdated = self.metadata_health_list_outdated(outdated_duration) + + healthy = ( + len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0 + ) + if not healthy: + log.info(f"{unhealthy=}, {outdated=}") + return healthy + def step_down(self): log.info("Asking storage controller to step down") response = self.request( diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index da638ac233..eb2cdccdb9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3,7 +3,7 @@ import threading import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -1785,6 +1785,126 @@ def test_storage_controller_node_deletion( env.storage_controller.consistency_check() +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_storage_controller_metadata_health( + neon_env_builder: NeonEnvBuilder, + shard_count: Optional[int], +): + """ + Create three tenants A, B, C. + + Phase 1: + - A: Post healthy status. + - B: Post unhealthy status. + - C: No updates. + + Phase 2: + - B: Post healthy status. + - C: Post healthy status. 
+ + Phase 3: + - A: Post unhealthy status. + + Phase 4: + - Delete tenant A, metadata health status should be deleted as well. + """ + + def update_and_query_metadata_health( + env: NeonEnv, + healthy: List[TenantShardId], + unhealthy: List[TenantShardId], + outdated_duration: str = "1h", + ) -> Tuple[Set[str], Set[str]]: + """ + Update metadata health. Then list tenant shards with unhealthy and + outdated metadata health status. + """ + if healthy or unhealthy: + env.storage_controller.metadata_health_update(healthy, unhealthy) + result = env.storage_controller.metadata_health_list_unhealthy() + unhealthy_res = set(result["unhealthy_tenant_shards"]) + result = env.storage_controller.metadata_health_list_outdated(outdated_duration) + outdated_res = set(record["tenant_shard_id"] for record in result["health_records"]) + + return unhealthy_res, outdated_res + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Mock tenant (`initial_tenant``) with healthy scrubber scan result + tenant_a_shard_ids = ( + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(env.initial_tenant, 0, 0)] + ) + + # Mock tenant with unhealthy scrubber scan result + tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_b_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_b, 0, 0)] + ) + + # Mock tenant that never gets a health update from scrubber + tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count) + + tenant_c_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_c, 0, 0)] + ) + + # Metadata health table also updated as tenant shards are created. + assert env.storage_controller.metadata_health_is_healthy() + + # post "fake" updates to storage controller db + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids + ) + + log.info(f"After Phase 1: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_b_shard_ids) + for t in tenant_b_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[] + ) + + log.info(f"After Phase 2: {unhealthy=}, {outdated=}") + assert len(unhealthy) == 0 + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids + ) + + log.info(f"After Phase 3: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_a_shard_ids) + for t in tenant_a_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + # Phase 4: Delete A + env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant) + + # A's unhealthy metadata health status should be deleted as well. + assert env.storage_controller.metadata_health_is_healthy() + + # All shards from B and C are not fresh if set outdated duration to 0 seconds. 
+ unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s" + ) + assert len(unhealthy) == 0 + for t in tenant_b_shard_ids + tenant_c_shard_ids: + assert str(t) in outdated + + def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): """ Test the `/control/v1/step_down` storage controller API. Upon receiving such From d1d4631c8f0eda39b9558ab542bf04ca7c4c5604 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:07:34 -0400 Subject: [PATCH 303/412] feat(scrubber): post `scan_metadata` results to storage controller (#8502) Part of #8128, followup to #8480. closes #8421. Enable scrubber to optionally post metadata scan health results to storage controller. Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/controller_api.rs | 5 ++- storage_scrubber/src/checks.rs | 5 +++ storage_scrubber/src/lib.rs | 7 ++++ storage_scrubber/src/main.rs | 40 ++++++++++++++----- .../src/pageserver_physical_gc.rs | 8 +--- .../src/scan_pageserver_metadata.rs | 32 ++++++++++----- test_runner/fixtures/neon_fixtures.py | 9 +++-- test_runner/regress/test_storage_scrubber.py | 16 +++++++- 8 files changed, 88 insertions(+), 34 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 36b1bd95ff..a5b452da83 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -304,8 +305,8 @@ pub struct MetadataHealthRecord { #[derive(Serialize, Deserialize, Debug)] pub struct MetadataHealthUpdateRequest { - pub healthy_tenant_shards: Vec, - pub unhealthy_tenant_shards: Vec, + pub healthy_tenant_shards: HashSet, + pub unhealthy_tenant_shards: HashSet, } #[derive(Serialize, Deserialize, Debug)] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index a35a58aedd..5aa9e88c40 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -40,6 +40,11 @@ impl TimelineAnalysis { garbage_keys: Vec::new(), } } + + /// Whether a timeline is healthy. 
+ pub(crate) fn is_healthy(&self) -> bool { + self.errors.is_empty() && self.warnings.is_empty() + } } pub(crate) async fn branch_cleanup_and_check_errors( diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index c7900f9b02..e0f154def3 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -32,6 +32,7 @@ use remote_storage::{ }; use reqwest::Url; use serde::{Deserialize, Serialize}; +use storage_controller_client::control_api; use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; use tracing::error; @@ -255,6 +256,12 @@ pub struct ControllerClientConfig { pub controller_jwt: String, } +impl ControllerClientConfig { + pub fn build_client(self) -> control_api::Client { + control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + } +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 346829b7c9..4c804c00c1 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,7 +1,8 @@ use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; +use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; -use reqwest::Url; +use reqwest::{Method, Url}; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -61,6 +62,8 @@ enum Command { json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long = "post", default_value_t = false)] + post_to_storage_controller: bool, #[arg(long, default_value = None)] /// For safekeeper node_kind only, points to db with debug dump dump_db_connstr: Option, @@ -116,11 +119,20 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + match cli.command { Command::ScanMetadata { json, tenant_ids, node_kind, + post_to_storage_controller, dump_db_connstr, dump_db_table, } => { @@ -159,6 +171,9 @@ async fn main() -> anyhow::Result<()> { } Ok(()) } else { + if controller_client_conf.is_none() && post_to_storage_controller { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } match scan_metadata(bucket_config.clone(), tenant_ids).await { Err(e) => { tracing::error!("Failed: {e}"); @@ -170,6 +185,21 @@ async fn main() -> anyhow::Result<()> { } else { println!("{}", summary.summary_string()); } + + if post_to_storage_controller { + if let Some(conf) = controller_client_conf { + let controller_client = conf.build_client(); + let body = summary.build_health_update_request(); + controller_client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + if summary.is_fatal() { Err(anyhow::anyhow!("Fatal scrub errors detected")) } else if summary.is_empty() { @@ -217,14 +247,6 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let controller_client_conf = cli.controller_api.map(|controller_api| { - ControllerClientConfig { - controller_api, - // Default to no key: this is a convenience 
when working in a development environment - controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), - } - }); - match (&controller_client_conf, mode) { (Some(_), _) => { // Any mode may run when controller API is set diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index e977fd49f7..69896caa82 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc( } // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC - let Some(controller_client) = controller_client_conf.as_ref().map(|c| { - let ControllerClientConfig { - controller_api, - controller_jwt, - } = c; - control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) - }) else { + let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else { tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); return Ok(summary); }; diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index fbd60f93bb..dc410bde41 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; +use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use serde::Serialize; use utils::id::TenantId; use utils::shard::ShardCount; -#[derive(Serialize)] +#[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, timeline_count: usize, @@ -23,19 +24,16 @@ pub struct MetadataSummary { with_warnings: HashSet, with_orphans: HashSet, indices_by_version: HashMap, + + #[serde(skip)] + pub(crate) healthy_tenant_shards: HashSet, + #[serde(skip)] + pub(crate) unhealthy_tenant_shards: HashSet, } impl MetadataSummary { fn new() -> Self { - Self { - tenant_count: 0, - timeline_count: 0, - timeline_shard_count: 0, - with_errors: HashSet::new(), - with_warnings: HashSet::new(), - with_orphans: HashSet::new(), - indices_by_version: HashMap::new(), - } + Self::default() } fn update_data(&mut self, data: &S3TimelineBlobData) { @@ -54,6 +52,13 @@ impl MetadataSummary { } fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { + if analysis.is_healthy() { + self.healthy_tenant_shards.insert(id.tenant_shard_id); + } else { + self.healthy_tenant_shards.remove(&id.tenant_shard_id); + self.unhealthy_tenant_shards.insert(id.tenant_shard_id); + } + if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -101,6 +106,13 @@ Index versions: {version_summary} pub fn is_empty(&self) -> bool { self.timeline_shard_count == 0 } + + pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest { + MetadataHealthUpdateRequest { + healthy_tenant_shards: self.healthy_tenant_shards.clone(), + unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(), + } + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b2ebea794..0c33dec784 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4401,10 +4401,11 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self) -> Any: - stdout = self.scrubber_cli( - ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 - ) + def scan_metadata(self, post_to_storage_controller: bool = False) -> Any: + args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + if post_to_storage_controller: + args.append("--post") + stdout = self.scrubber_cli(args, timeout=30) try: return json.loads(stdout) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index a45430ca86..fadf438788 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -440,10 +440,12 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it = iter(index.layer_metadata.items()) - scan_summary = env.storage_scrubber.scan_metadata() + scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) assert not scan_summary["with_warnings"] assert not scan_summary["with_errors"] + assert env.storage_controller.metadata_health_is_healthy() + # Delete a layer file that is listed in the index. layer, metadata = next(it) log.info(f"Deleting {timeline_path}/{layer.to_str()}") @@ -453,7 +455,17 @@ def test_scrubber_scan_pageserver_metadata( ) log.info(f"delete response: {delete_response}") - # Check scan summary. Expect it to be a L0 layer so only emit warnings. + # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. 
scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 + + assert env.storage_controller.metadata_health_is_healthy() + + # Now post to storage controller, expect seeing one unhealthy health record + scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 + + unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] + assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) From fb6c1e939049fc80cc0671ebae457e3662db0f5a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 30 Jul 2024 18:13:18 +0200 Subject: [PATCH 304/412] cleanup(compact_level0_phase1): some commentary and wrapping into block expressions (#8544) Byproduct of scouting done for https://github.com/neondatabase/neon/issues/8184 refs https://github.com/neondatabase/neon/issues/8184 --- pageserver/src/tenant/timeline.rs | 21 +--- pageserver/src/tenant/timeline/compaction.rs | 126 ++++++++++++------- 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 862ca42188..2b205db6e1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -58,7 +58,7 @@ use std::{ sync::atomic::AtomicU64, }; use std::{ - cmp::{max, min, Ordering}, + cmp::{max, min}, ops::ControlFlow, }; use std::{ @@ -177,25 +177,6 @@ impl std::fmt::Display for ImageLayerCreationMode { } } -/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap -#[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) struct Hole { - key_range: Range, - coverage_size: usize, -} - -impl Ord for Hole { - fn cmp(&self, other: &Self) -> Ordering { - other.coverage_size.cmp(&self.coverage_size) // inverse order - } -} - -impl PartialOrd for Hole { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 497d631f4f..3292b4a121 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -30,8 +30,8 @@ use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPA use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; +use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -608,62 +608,93 @@ impl Timeline { .read_lock_held_spawn_blocking_startup_micros .till_now(); - // Determine N largest holes where N is number of compacted layers. 
- let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + // TODO: replace with streaming k-merge + let all_keys = { + let mut all_keys = Vec::new(); + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); + } + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + all_keys + }; stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter, do not create hole entries for metadata keys. The last hole in the - // compaction is the gap between data key and metadata keys. - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range - && !Key::is_metadata_key(&prev_key) - { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole + // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. + // + // A hole is a key range for which this compaction doesn't have any WAL records. + // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range, + // cover the hole, but actually don't contain any WAL records for that key range. + // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`). + // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records. + // + // The algorithm chooses holes as follows. + // - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys). 
+ // - Filter: min threshold on range length + // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data) + // + // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451 + #[derive(PartialEq, Eq)] + struct Hole { + key_range: Range, + coverage_size: usize, + } + let holes: Vec = { + use std::cmp::Ordering; + impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + self.coverage_size.cmp(&other.coverage_size).reverse() + } + } + impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = + layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } } } } + prev = Some(next_key.next()); } - prev = Some(next_key.next()); - } + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + holes + }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. @@ -738,6 +769,7 @@ impl Timeline { let mut key_values_total_size = 0u64; let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + let mut next_hole = 0; // index of next hole in holes vector for &DeltaEntry { key, lsn, ref val, .. From fa24d27d38aba64c38b7210fe6e81421592ee53a Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Tue, 30 Jul 2024 22:34:15 +0300 Subject: [PATCH 305/412] cicd: change Azure storage details [1/2] (#8553) Change Azure storage configuration to point to new variables/secrets. 
They have the `_NEW` suffix in order not to disrupt any tests while we complete the switch. --- .github/actionlint.yml | 1 + .github/workflows/_build-and-test-locally.yml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..f086008d34 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -9,5 +9,6 @@ self-hosted-runner: - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER + - REMOTE_STORAGE_AZURE_CONTAINER_NEW - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 26e234a04d..7751f9e8c9 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -219,9 +219,9 @@ jobs: # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV_NEW }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV_NEW }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER_NEW }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' From 35738ca37f4767c56cb5bd7cb132d694a0f238e5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 31 Jul 2024 14:17:59 +0200 Subject: [PATCH 306/412] compaction_level0_phase1: bypass PS PageCache for data blocks (#8543) part of https://github.com/neondatabase/neon/issues/8184 # Problem We want to bypass PS PageCache for all data block reads, but `compact_level0_phase1` currently uses `ValueRef::load` to load the WAL records from delta layers. Internally, that maps to `FileBlockReader:read_blk` which hits the PageCache [here](https://github.com/neondatabase/neon/blob/e78341e1c220625d9bfa3f08632bd5cfb8e6a876/pageserver/src/tenant/block_io.rs#L229-L236). # Solution This PR adds a mode for `compact_level0_phase1` that uses the `MergeIterator` for reading the `Value`s from the delta layer files. `MergeIterator` is a streaming k-merge that uses vectored blob_io under the hood, which bypasses the PS PageCache for data blocks. Other notable changes: * change the `DiskBtreeReader::into_stream` to buffer the node, instead of holding a `PageCache` `PageReadGuard`. * Without this, we run out of page cache slots in `test_pageserver_compaction_smoke`. * Generally, `PageReadGuard`s aren't supposed to be held across await points, so, this is a general bugfix. # Testing / Validation / Performance `MergeIterator` has not yet been used in production; it's being developed as part of * https://github.com/neondatabase/neon/issues/8002 Therefore, this PR adds a validation mode that compares the existing approach's value iterator with the new approach's stream output, item by item. If they're not identical, we log a warning / fail the unit/regression test. To avoid flooding the logs, we apply a global rate limit of once per 10 seconds. In any case, we use the existing approach's value. 
Expected performance impact that will be monitored in staging / nightly benchmarks / eventually pre-prod: * with validation: * increased CPU usage * ~doubled VirtualFile read bytes/second metric * no change in disk IO usage because the kernel page cache will likely have the pages buffered on the second read * without validation: * slightly higher DRAM usage because each iterator participating in the k-merge has a dedicated buffer (as opposed to before, where compactions would rely on the PS PageCaceh as a shared evicting buffer) * less disk IO if previously there were repeat PageCache misses (likely case on a busy production Pageserver) * lower CPU usage: PageCache out of the picture, fewer syscalls are made (vectored blob io batches reads) # Rollout The new code is used with validation mode enabled-by-default. This gets us validation everywhere by default, specifically in - Rust unit tests - Python tests - Nightly pagebench (shouldn't really matter) - Staging Before the next release, I'll merge the following aws.git PR that configures prod to continue using the existing behavior: * https://github.com/neondatabase/aws/pull/1663 # Interactions With Other Features This work & rollout should complete before Direct IO is enabled because Direct IO would double the IOPS & latency for each compaction read (#8240). # Future Work The streaming k-merge's memory usage is proportional to the amount of memory per participating layer. But `compact_level0_phase1` still loads all keys into memory for `all_keys_iter`. Thus, it continues to have active memory usage proportional to the number of keys involved in the compaction. Future work should replace `all_keys_iter` with a streaming keys iterator. This PR has a draft in its first commit, which I later reverted because it's not necessary to achieve the goal of this PR / issue #8184. --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 19 ++ pageserver/src/repository.rs | 3 +- pageserver/src/tenant/disk_btree.rs | 13 +- pageserver/src/tenant/timeline/compaction.rs | 184 ++++++++++++++++++- 5 files changed, 210 insertions(+), 10 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7a96c86ded..2d00f311fb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -129,6 +129,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); + info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f71881683d..41c2fe0af3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -29,6 +29,7 @@ use utils::{ logging::LogFormat, }; +use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -295,6 +296,10 @@ pub struct PageServerConf { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: L0FlushConfig, + + /// This flag is temporary and will be removed after gradual rollout. + /// See . 
+ pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -401,6 +406,8 @@ struct PageServerConfigBuilder { ephemeral_bytes_per_memory_kb: BuilderValue, l0_flush: BuilderValue, + + compact_level0_phase1_value_access: BuilderValue, } impl PageServerConfigBuilder { @@ -490,6 +497,7 @@ impl PageServerConfigBuilder { validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), + compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), } } } @@ -673,6 +681,10 @@ impl PageServerConfigBuilder { self.l0_flush = BuilderValue::Set(value); } + pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { + self.compact_level0_phase1_value_access = BuilderValue::Set(value); + } + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); @@ -730,6 +742,7 @@ impl PageServerConfigBuilder { image_compression, ephemeral_bytes_per_memory_kb, l0_flush, + compact_level0_phase1_value_access, } CUSTOM LOGIC { @@ -1002,6 +1015,9 @@ impl PageServerConf { "l0_flush" => { builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) } + "compact_level0_phase1_value_access" => { + builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1086,6 +1102,7 @@ impl PageServerConf { validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), } } } @@ -1327,6 +1344,7 @@ background_task_maximum_delay = '334 s' image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1401,6 +1419,7 @@ background_task_maximum_delay = '334 s' image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5a334d0290..e4ebafd927 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -8,8 +8,7 @@ use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; /// A 'value' stored for a one Key. 
-#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value Image(Bytes), diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 1583a3826a..0107b0ac7e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -296,13 +296,19 @@ where let mut stack = Vec::new(); stack.push((self.root_blk, None)); let block_cursor = self.reader.block_cursor(); + let mut node_buf = [0_u8; PAGE_SZ]; while let Some((node_blknum, opt_iter)) = stack.pop() { - // Locate the node. - let node_buf = block_cursor + // Read the node, through the PS PageCache, into local variable `node_buf`. + // We could keep the page cache read guard alive, but, at the time of writing, + // we run quite small PS PageCache s => can't risk running out of + // PageCache space because this stream isn't consumed fast enough. + let page_read_guard = block_cursor .read_blk(self.start_blk + node_blknum, ctx) .await?; + node_buf.copy_from_slice(page_read_guard.as_ref()); + drop(page_read_guard); // drop page cache read guard early - let node = OnDiskNode::deparse(node_buf.as_ref())?; + let node = OnDiskNode::deparse(&node_buf)?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -345,6 +351,7 @@ where Either::Left(idx..node.num_children.into()) }; + // idx points to the first match now. Keep going from there while let Some(idx) = iter.next() { let key_off = idx * suffix_len; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 3292b4a121..7bfa8e9d35 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -698,7 +698,140 @@ impl Timeline { // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); + // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), + // then the Value::Image is ordered before Value::WalRecord. + // + // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io + // option and validation code once we've reached confidence. + enum AllValuesIter<'a> { + PageCachedBlobIo { + all_keys_iter: VecIter<'a>, + }, + StreamingKmergeBypassingPageCache { + merge_iter: MergeIterator<'a>, + }, + ValidatingStreamingKmergeBypassingPageCache { + mode: CompactL0BypassPageCacheValidation, + merge_iter: MergeIterator<'a>, + all_keys_iter: VecIter<'a>, + }, + } + type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes + impl AllValuesIter<'_> { + async fn next_all_keys_iter( + iter: &mut VecIter<'_>, + ctx: &RequestContext, + ) -> anyhow::Result> { + let Some(DeltaEntry { + key, + lsn, + val: value_ref, + .. 
+ }) = iter.next() + else { + return Ok(None); + }; + let value = value_ref.load(ctx).await?; + Ok(Some((*key, *lsn, value))) + } + async fn next( + &mut self, + ctx: &RequestContext, + ) -> anyhow::Result> { + match self { + AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { + Self::next_all_keys_iter(iter, ctx).await + } + AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, + AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { + // advance both iterators + let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; + let merge_iter_item = merge_iter.next().await; + // compare results & log warnings as needed + macro_rules! rate_limited_warn { + ($($arg:tt)*) => {{ + if cfg!(debug_assertions) || cfg!(feature = "testing") { + warn!($($arg)*); + panic!("CompactL0BypassPageCacheValidation failure, check logs"); + } + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + use std::sync::Mutex; + use std::time::Duration; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!($($arg)*); + }); + }} + } + match (&all_keys_iter_item, &merge_iter_item) { + (Err(_), Err(_)) => { + // don't bother asserting equivality of the errors + } + (Err(all_keys), Ok(merge)) => { + rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); + }, + (Ok(all_keys), Err(merge)) => { + rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); + }, + (Ok(None), Ok(None)) => { } + (Ok(Some(all_keys)), Ok(None)) => { + rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); + } + (Ok(None), Ok(Some(merge))) => { + rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); + } + (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { + match mode { + // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one + CompactL0BypassPageCacheValidation::KeyLsn => { + let all_keys = (all_keys_key, all_keys_lsn); + let merge = (merge_key, merge_lsn); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); + } + } + CompactL0BypassPageCacheValidation::KeyLsnValue => { + let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); + let merge = (merge_key, merge_lsn, merge_value); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); + } + } + } + } + } + // in case of mismatch, trust the legacy all_keys_iter_item + all_keys_iter_item + }.instrument(info_span!("next")).await + } + } + } + let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { + CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { + all_keys_iter: all_keys.iter(), + }, + CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { + let merge_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); + } + MergeIterator::create(&deltas, &[], ctx) + }; + match validate { + None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter 
}, + Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { + mode: validate.clone(), + merge_iter, + all_keys_iter: all_keys.iter(), + }, + } + } + }; // This iterator walks through all keys and is needed to calculate size used by each key let mut all_keys_iter = all_keys @@ -771,11 +904,11 @@ impl Timeline { let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key let mut next_hole = 0; // index of next hole in holes vector - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter + while let Some((key, lsn, value)) = all_values_iter + .next(ctx) + .await + .map_err(CompactionError::Other)? { - let value = val.load(ctx).await.map_err(CompactionError::Other)?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -960,6 +1093,10 @@ impl Timeline { } } + // Without this, rustc complains about deltas_to_compact still + // being borrowed when we `.into_iter()` below. + drop(all_values_iter); + Ok(CompactLevel0Phase1Result { new_layers, deltas_to_compact: deltas_to_compact @@ -1067,6 +1204,43 @@ impl TryFrom for CompactLevel0Phase1Stats { } } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. + KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// From 0356fc426b347cd8a4ec3c21bce88922f9fb78ea Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 31 Jul 2024 15:10:27 +0100 Subject: [PATCH 307/412] CI(regress-tests): run less regression tests (#8561) ## Problem We run regression tests on `release` & `debug` builds for each of the three supported Postgres versions (6 in total). With upcoming ARM support and Postgres 17, the number of jobs will jump to 16, which is a lot. 
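(Concretely: 2 build types × 3 Postgres versions = 6 jobs today; with 2 architectures and 4 Postgres versions that becomes 2 × 4 × 2 = 16.)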
See the internal discussion here: https://neondb.slack.com/archives/C033A2WE6BZ/p1722365908404329 ## Summary of changes - Run `regress-tests` job in debug builds only with the latest Postgres version - Do not do `debug` builds on release branches --- .github/workflows/_build-and-test-locally.yml | 8 ++++++-- .github/workflows/build_and_test.yml | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7751f9e8c9..182e96a8ca 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -19,6 +19,10 @@ on: description: 'debug or release' required: true type: string + pg-versions: + description: 'a json array of postgres versions to run regression tests on' + required: true + type: string defaults: run: @@ -254,7 +258,7 @@ jobs: strategy: fail-fast: false matrix: - pg_version: [ v14, v15, v16 ] + pg_version: ${{ fromJson(inputs.pg-versions) }} steps: - uses: actions/checkout@v4 with: @@ -284,5 +288,5 @@ jobs: - name: Merge and upload coverage data if: | false && - inputs.build-type == 'debug' && matrix.pg_version == 'v14' + inputs.build-type == 'debug' && matrix.pg_version == 'v16' uses: ./.github/actions/save-coverage-data diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3cf40e6153..c4df98f585 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -203,7 +203,8 @@ jobs: fail-fast: false matrix: arch: [ x64 ] - build-type: [ debug, release ] + # Do not build or run tests in debug for release branches + build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -213,6 +214,8 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking From 311cc71b086bd01e79741106cf945439ded7e22d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 31 Jul 2024 10:48:48 -0400 Subject: [PATCH 308/412] feat(pageserver): support btm-gc-compaction for child branches (#8519) part of https://github.com/neondatabase/neon/issues/8002 For child branches, we will pull the image of the modified keys from the parant into the child branch, which creates a full history for generating key retention. If there are not enough delta keys, the image won't be wrote eventually, and we will only keep the deltas inside the child branch. 
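To make the mechanism concrete, here is a minimal, self-contained sketch (not pageserver code; the `Lsn` alias, the append-style WAL records, and the example values mirror the unit tests added in this patch) of how an ancestor image plus the child branch's deltas reconstruct a full value:

```rust
// Toy model of branch compaction input: the child branch stores only deltas
// written after the branch point, so reconstructing a value needs the image
// pulled from the parent plus the child's WAL records (modelled here as
// string appends, like the tests in this patch). Names/types are illustrative only.
type Lsn = u64;

fn reconstruct(
    ancestor_image: (Lsn, String),   // base image pulled from the parent branch
    child_deltas: &[(Lsn, String)],  // deltas recorded on the child branch
    read_lsn: Lsn,
) -> String {
    let (base_lsn, mut value) = ancestor_image;
    assert!(base_lsn <= read_lsn);
    for (lsn, delta) in child_deltas {
        if *lsn <= read_lsn {
            value.push_str(delta); // "apply" the WAL record
        }
    }
    value
}

fn main() {
    let img = (0x10, "value 3@0x10".to_string());
    let deltas = [
        (0x28, "@0x28".to_string()),
        (0x30, "@0x30".to_string()),
        (0x40, "@0x40".to_string()),
    ];
    // The full history exists only because the ancestor image was pulled in;
    // the child branch alone holds just "@0x28@0x30@0x40".
    assert_eq!(reconstruct(img, &deltas, 0x40), "value 3@0x10@0x28@0x30@0x40");
}
```

In the real code the ancestor image is fetched by reading the child timeline at its ancestor LSN, reusing the existing read path rather than a dedicated API.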
We could avoid the wasteful work to pull the image from the parent if we can know the number of deltas in advance, in the future (currently we always pull image for all modified keys in the child branch) --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 293 ++++++++++++++++++- pageserver/src/tenant/timeline.rs | 7 +- pageserver/src/tenant/timeline/compaction.rs | 135 ++++++--- 3 files changed, 400 insertions(+), 35 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e5ac6725ad..48c1851a3a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7347,6 +7347,7 @@ mod tests { Lsn(0x60), &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], 3, + None, ) .await .unwrap(); @@ -7471,7 +7472,7 @@ mod tests { ), ]; let res = tline - .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3) + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) .await .unwrap(); let expected_res = KeyHistoryRetention { @@ -7517,6 +7518,114 @@ mod tests { }; assert_eq!(res, expected_res); + // In case of branch compaction, the branch itself does not have the full history, and we need to provide + // the ancestor image in the test case. + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page + )]), + )], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x30)], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x30), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + Ok(()) } @@ -7715,4 +7824,186 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + let mut key = 
Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let parent_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // delta layers + vec![(Lsn(0x18), img_layer)], // image layers + Lsn(0x18), + ) + .await?; + + parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + let branch_tline = tenant + .branch_timeline_test_with_layers( + &parent_tline, + NEW_TIMELINE_ID, + Some(Lsn(0x18)), + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await?; + + branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + { + // Update GC info + let mut guard = parent_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x10), + space: Lsn(0x10), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + { + // Update GC info + let mut guard = branch_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x50), + space: Lsn(0x50), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_lsn_40 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = 
|| async { + for idx in 0..10 { + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x40), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_40[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + verify_result().await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2b205db6e1..4db44a3a19 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -529,7 +529,6 @@ impl GetVectoredError { } } -#[derive(Debug)] pub struct MissingKeyError { key: Key, shard: ShardNumber, @@ -540,6 +539,12 @@ pub struct MissingKeyError { backtrace: Option, } +impl std::fmt::Debug for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7bfa8e9d35..5e9ff1c9e4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,6 +15,7 @@ use super::{ }; use anyhow::{anyhow, Context}; +use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -69,17 +70,21 @@ impl KeyHistoryRetention { self, key: Key, delta_writer: &mut Vec<(Key, Lsn, Value)>, - image_writer: &mut ImageLayerWriter, + mut image_writer: Option<&mut ImageLayerWriter>, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; - for (_, KeyLogAtLsn(logs)) in self.below_horizon { + for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { let Value::Image(img) = &logs[0].1 else { unreachable!() }; - image_writer.put_image(key, img.clone(), ctx).await?; + if let Some(image_writer) = image_writer.as_mut() { + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + delta_writer.push((key, cutoff_lsn, Value::Image(img.clone()))); + } } else { for (lsn, val) in logs { delta_writer.push((key, lsn, val)); @@ -1328,6 +1333,7 @@ impl Timeline { horizon: Lsn, retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, + base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants if cfg!(debug_assertions) { @@ -1357,6 +1363,7 @@ impl Timeline { ); } } + let has_ancestor = base_img_from_ancestor.is_some(); // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. let (mut split_history, lsn_split_points) = { @@ -1390,6 +1397,9 @@ impl Timeline { // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply // dropped. + // + // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta + // threshold, we could have kept delta instead to save space. This is an optimization for the future. 
continue; } } @@ -1407,9 +1417,13 @@ impl Timeline { "should have at least below + above horizon batches" ); let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + if let Some((key, lsn, img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img))); + } for (i, split_for_lsn) in split_history.into_iter().enumerate() { + // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 { + let generate_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { @@ -1532,20 +1546,25 @@ impl Timeline { retain_lsns_below_horizon.sort(); (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; - let lowest_retain_lsn = retain_lsns_below_horizon - .first() - .copied() - .unwrap_or(gc_cutoff); - if cfg!(debug_assertions) { - assert_eq!( - lowest_retain_lsn, - retain_lsns_below_horizon - .iter() - .min() - .copied() - .unwrap_or(gc_cutoff) - ); - } + let lowest_retain_lsn = if self.ancestor_timeline.is_some() { + Lsn(self.ancestor_lsn.0 + 1) + } else { + let res = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + res, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } + res + }; info!( "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", layer_selection.len(), @@ -1586,6 +1605,7 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; + #[allow(clippy::too_many_arguments)] async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, last_key: Key, @@ -1594,6 +1614,7 @@ impl Timeline { tline: &Arc, lowest_retain_lsn: Lsn, ctx: &RequestContext, + last_batch: bool, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid // overlapping layers. @@ -1614,7 +1635,7 @@ impl Timeline { *current_delta_split_point += 1; need_split = true; } - if !need_split { + if !need_split && !last_batch { return Ok(None); } let deltas = std::mem::take(deltas); @@ -1639,15 +1660,44 @@ impl Timeline { Ok(Some(delta_layer)) } - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &(Key::MIN..Key::MAX), // covers the full key range - lowest_retain_lsn, - ctx, - ) - .await?; + // Only create image layers when there is no ancestor branches. TODO: create covering image layer + // when some condition meet. + let mut image_layer_writer = if self.ancestor_timeline.is_none() { + Some( + ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(Key::MIN..Key::MAX), // covers the full key range + lowest_retain_lsn, + ctx, + ) + .await?, + ) + } else { + None + }; + + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. + /// + /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image + /// is needed for reconstruction. This should be fixed in the future. + /// + /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor + /// images. 
+ async fn get_ancestor_image( + tline: &Arc, + key: Key, + ctx: &RequestContext, + ) -> anyhow::Result> { + if tline.ancestor_timeline.is_none() { + return Ok(None); + }; + // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing + // as much existing code as possible. + let img = tline.get(key, tline.ancestor_lsn, ctx).await?; + Ok(Some((key, tline.ancestor_lsn, img))) + } let mut delta_values = Vec::new(); let delta_split_points = delta_split_points.into_iter().collect_vec(); @@ -1668,11 +1718,17 @@ impl Timeline { gc_cutoff, &retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, ) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. retention - .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx) + .pipe_to( + *last_key, + &mut delta_values, + image_layer_writer.as_mut(), + ctx, + ) .await?; delta_layers.extend( flush_deltas( @@ -1683,6 +1739,7 @@ impl Timeline { self, lowest_retain_lsn, ctx, + false, ) .await?, ); @@ -1701,11 +1758,17 @@ impl Timeline { gc_cutoff, &retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, ) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. retention - .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx) + .pipe_to( + last_key, + &mut delta_values, + image_layer_writer.as_mut(), + ctx, + ) .await?; delta_layers.extend( flush_deltas( @@ -1716,19 +1779,25 @@ impl Timeline { self, lowest_retain_lsn, ctx, + true, ) .await?, ); + assert!(delta_values.is_empty(), "unprocessed keys"); - let image_layer = image_layer_writer.finish(self, ctx).await?; + let image_layer = if let Some(writer) = image_layer_writer { + Some(writer.finish(self, ctx).await?) + } else { + None + }; info!( "produced {} delta layers and {} image layers", delta_layers.len(), - 1 + if image_layer.is_some() { 1 } else { 0 } ); let mut compact_to = Vec::new(); compact_to.extend(delta_layers); - compact_to.push(image_layer); + compact_to.extend(image_layer); // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; From 31122adee3fa0dac6bec5bfd1682f04adf42e490 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 31 Jul 2024 17:05:45 +0200 Subject: [PATCH 309/412] refactor(page_service): Timeline gate guard holding + cancellation + shutdown (#8339) Since the introduction of sharding, the protocol handling loop in `handle_pagerequests` cannot know anymore which concrete `Tenant`/`Timeline` object any of the incoming `PagestreamFeMessage` resolves to. In fact, one message might resolve to one `Tenant`/`Timeline` while the next one may resolve to another one. To avoid going to tenant manager, we added the `shard_timelines` which acted as an ever-growing cache that held timeline gate guards open for the lifetime of the connection. The consequence of holding the gate guards open was that we had to be sensitive to every cached `Timeline::cancel` on each interaction with the network connection, so that Timeline shutdown would not have to wait for network connection interaction. We can do better than that, meaning more efficiency & better abstraction. I proposed a sketch for it in * https://github.com/neondatabase/neon/pull/8286 and this PR implements an evolution of that sketch. The main idea is is that `mod page_service` shall be solely concerned with the following: 1. 
receiving requests by speaking the protocol / pagestream subprotocol 2. dispatching the request to a corresponding method on the correct shard/`Timeline` object 3. sending response by speaking the protocol / pagestream subprotocol. The cancellation sensitivity responsibilities are clear cut: * while in `page_service` code, sensitivity to page_service cancellation is sufficient * while in `Timeline` code, sensitivity to `Timeline::cancel` is sufficient To enforce these responsibilities, we introduce the notion of a `timeline::handle::Handle` to a `Timeline` object that is checked out from a `timeline::handle::Cache` for **each request**. The `Handle` derefs to `Timeline` and is supposed to be used for a single async method invocation on `Timeline`. See the lengthy doc comment in `mod handle` for details of the design. --- pageserver/src/bin/pageserver.rs | 43 +- pageserver/src/http/routes.rs | 5 + pageserver/src/lib.rs | 10 +- pageserver/src/page_service.rs | 766 +++++++++--------- pageserver/src/tenant.rs | 2 + pageserver/src/tenant/mgr.rs | 6 +- pageserver/src/tenant/timeline.rs | 20 + pageserver/src/tenant/timeline/handle.rs | 967 +++++++++++++++++++++++ 8 files changed, 1387 insertions(+), 432 deletions(-) create mode 100644 pageserver/src/tenant/timeline/handle.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2d00f311fb..5ebd6511ac 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; +use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{ - CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener, -}; +use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; @@ -31,11 +29,9 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::PageServerConf, - context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, - task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -594,30 +590,13 @@ fn start_pageserver( // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - let libpq_listener = { - let cancel = CancellationToken::new(); - let libpq_ctx = RequestContext::todo_child( - TaskKind::LibpqEndpointListener, - // listener task shouldn't need to download anything. (We will - // create a separate sub-contexts for each connection, with their - // own download behavior. This context is used only to listen and - // accept connections.) 
- DownloadBehavior::Error, - ); - - let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "libpq listener", - page_service::libpq_listener_main( - tenant_manager.clone(), - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - cancel.clone(), - ), - )); - LibpqEndpointListener(CancellableTask { task, cancel }) - }; + let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? + }); let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -645,7 +624,7 @@ fn start_pageserver( shutdown_pageserver.take(); pageserver::shutdown_pageserver( http_endpoint_listener, - libpq_listener, + page_service, consumption_metrics_tasks, disk_usage_eviction_task, &tenant_manager, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9222123ad3..117f2c5869 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -296,6 +296,11 @@ impl From for ApiError { GetActiveTenantError::WaitForActiveTimeout { .. } => { ApiError::ResourceUnavailable(format!("{}", e).into()) } + GetActiveTenantError::SwitchedTenant => { + // in our HTTP handlers, this error doesn't happen + // TODO: separate error types + ApiError::ResourceUnavailable("switched tenant".into()) + } } } } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d944019641..f729cad3c3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -30,7 +30,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; use tenant::{ @@ -63,7 +62,6 @@ pub struct CancellableTask { pub cancel: CancellationToken, } pub struct HttpEndpointListener(pub CancellableTask); -pub struct LibpqEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); impl CancellableTask { @@ -77,7 +75,7 @@ impl CancellableTask { #[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( http_listener: HttpEndpointListener, - libpq_listener: LibpqEndpointListener, + page_service: page_service::Listener, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, @@ -89,8 +87,8 @@ pub async fn shutdown_pageserver( use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
- timed( - libpq_listener.0.shutdown(), + let remaining_connections = timed( + page_service.stop_accepting(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -108,7 +106,7 @@ pub async fn shutdown_pageserver( // Shut down any page service tasks: any in-progress work for particular timelines or tenants // should already have been canclled via mgr::shutdown_all_tenants timed( - task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None), + remaining_connections.shutdown(), "shutdown PageRequestHandlers", Duration::from_secs(1), ) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6353f713e0..5344b83e0d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,8 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; +use futures::FutureExt; +use once_cell::sync::OnceCell; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -15,28 +14,23 @@ use pageserver_api::models::{ PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, PagestreamProtocolVersion, }; -use pageserver_api::shard::ShardIndex; -use pageserver_api::shard::ShardNumber; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; -use std::collections::HashMap; use std::io; -use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use std::time::SystemTime; +use std::time::{Duration, Instant}; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -47,61 +41,130 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr::GetActiveTenantError; -use crate::tenant::mgr::GetTenantError; -use crate::tenant::mgr::ShardResolveResult; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::WaitLsnError; +use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; -use crate::tenant::Tenant; use crate::tenant::Timeline; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// How long 
we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which -// is not yet in state [`TenantState::Active`]. +/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which +/// is not yet in state [`TenantState::Active`]. +/// +/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /////////////////////////////////////////////////////////////////////////////// +pub struct Listener { + cancel: CancellationToken, + /// Cancel the listener task through `listen_cancel` to shut down the listener + /// and get a handle on the existing connections. + task: JoinHandle, +} + +pub struct Connections { + cancel: CancellationToken, + tasks: tokio::task::JoinSet, +} + +pub fn spawn( + conf: &'static PageServerConf, + tenant_manager: Arc, + pg_auth: Option>, + tcp_listener: tokio::net::TcpListener, +) -> Listener { + let cancel = CancellationToken::new(); + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + libpq_listener_main( + tenant_manager, + pg_auth, + tcp_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ) + .map(anyhow::Ok), + )); + + Listener { cancel, task } +} + +impl Listener { + pub async fn stop_accepting(self) -> Connections { + self.cancel.cancel(); + self.task + .await + .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") + } +} +impl Connections { + pub async fn shutdown(self) { + let Self { cancel, mut tasks } = self; + cancel.cancel(); + while let Some(res) = tasks.join_next().await { + // the logging done here mimics what was formerly done by task_mgr + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), + } + } + } +} + /// /// Main loop of the page service. /// /// Listens for connections, and launches a new handler task for each. /// +/// Returns Ok(()) upon cancellation via `cancel`, returning the set of +/// open connections. +/// pub async fn libpq_listener_main( tenant_manager: Arc, auth: Option>, - listener: TcpListener, + listener: tokio::net::TcpListener, auth_type: AuthType, listener_ctx: RequestContext, - cancel: CancellationToken, -) -> anyhow::Result<()> { - listener.set_nonblocking(true)?; - let tokio_listener = tokio::net::TcpListener::from_std(listener)?; + listener_cancel: CancellationToken, +) -> Connections { + let connections_cancel = CancellationToken::new(); + let mut connection_handler_tasks = tokio::task::JoinSet::default(); // Wait for a new connection to arrive, or for server shutdown. while let Some(res) = tokio::select! { biased; - _ = cancel.cancelled() => { + _ = listener_cancel.cancelled() => { // We were requested to shut down. None } - res = tokio_listener.accept() => { + res = listener.accept() => { Some(res) } } { @@ -110,28 +173,16 @@ pub async fn libpq_listener_main( // Connection established. Spawn a new task to handle it. 
debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); - - // PageRequestHandler tasks are not associated with any particular - // timeline in the task manager. In practice most connections will - // only deal with a particular timeline, but we don't know which one - // yet. - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::PageRequestHandler, - None, - None, - "serving compute connection task", - page_service_conn_main( - tenant_manager.clone(), - local_auth, - socket, - auth_type, - connection_ctx, - ), - ); + connection_handler_tasks.spawn(page_service_conn_main( + tenant_manager.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + connections_cancel.child_token(), + )); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -140,11 +191,16 @@ pub async fn libpq_listener_main( } } - debug!("page_service loop terminated"); + debug!("page_service listener loop terminated"); - Ok(()) + Connections { + cancel: connections_cancel, + tasks: connection_handler_tasks, + } } +type ConnectionHandlerResult = anyhow::Result<()>; + #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, @@ -152,7 +208,8 @@ async fn page_service_conn_main( socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, -) -> anyhow::Result<()> { + cancel: CancellationToken, +) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) .guard(); @@ -200,13 +257,11 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, auth, connection_ctx, cancel.clone()); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - match pgbackend - .run(&mut conn_handler, &task_mgr::shutdown_token()) - .await - { + match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { // we've been requested to shut down Ok(()) @@ -223,32 +278,154 @@ async fn page_service_conn_main( } } -/// While a handler holds a reference to a Timeline, it also holds a the -/// timeline's Gate open. -struct HandlerTimeline { - timeline: Arc, - _guard: GateGuard, -} - struct PageServerHandler { auth: Option>, claims: Option, - tenant_manager: Arc, - /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, /// `process_query` creates a child context from this one. connection_ctx: RequestContext, - /// See [`Self::cache_timeline`] for usage. - /// + cancel: CancellationToken, + + timeline_handles: TimelineHandles, +} + +struct TimelineHandles { + wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), /// or the ratio used when splitting shards (i.e. how many children created from one) /// parent shard, where a "large" number might be ~8. 
- shard_timelines: HashMap, + handles: timeline::handle::Cache, +} + +impl TimelineHandles { + fn new(tenant_manager: Arc) -> Self { + Self { + wrapper: TenantManagerWrapper { + tenant_manager, + tenant_id: OnceCell::new(), + }, + handles: Default::default(), + } + } + async fn get( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::SwitchedTenant, + )); + } + self.handles + .get(timeline_id, shard_selector, &self.wrapper) + .await + .map_err(|e| match e { + timeline::handle::GetError::TenantManager(e) => e, + timeline::handle::GetError::TimelineGateClosed => { + trace!("timeline gate closed"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + timeline::handle::GetError::PerTimelineStateShutDown => { + trace!("per-timeline state shut down"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + }) + } +} + +pub(crate) struct TenantManagerWrapper { + tenant_manager: Arc, + // We do not support switching tenant_id on a connection at this point. + // We can can add support for this later if needed without changing + // the protocol. + tenant_id: once_cell::sync::OnceCell, +} + +#[derive(Debug)] +pub(crate) struct TenantManagerTypes; + +impl timeline::handle::Types for TenantManagerTypes { + type TenantManagerError = GetActiveTimelineError; + type TenantManager = TenantManagerWrapper; + type Timeline = Arc; +} + +impl timeline::handle::ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { + Timeline::shard_timeline_id(self) + } + + fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { + &self.handles + } + + fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { + Timeline::get_shard_identity(self) + } +} + +impl timeline::handle::TenantManager for TenantManagerWrapper { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + let tenant_id = self.tenant_id.get().expect("we set this in get()"); + let timeout = ACTIVE_TENANT_TIMEOUT; + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)), + )); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + })); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await + .map_err(GetActiveTimelineError::Tenant)?; + + let timeline = tenant_shard + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; + set_tracing_field_shard_id(&timeline); + Ok(timeline) + } } #[derive(thiserror::Error, Debug)] @@ -292,7 +469,11 @@ impl From for PageStreamError { impl From for PageStreamError { fn from(value: GetActiveTimelineError) -> Self { match value { - GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) + | GetActiveTimelineError::Tenant(GetActiveTenantError::WillNotBecomeActive( + TenantState::Stopping { .. }, + )) + | GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) => Self::Shutdown, GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), } @@ -324,64 +505,17 @@ impl PageServerHandler { tenant_manager: Arc, auth: Option>, connection_ctx: RequestContext, + cancel: CancellationToken, ) -> Self { PageServerHandler { - tenant_manager, auth, claims: None, connection_ctx, - shard_timelines: HashMap::new(), + timeline_handles: TimelineHandles::new(tenant_manager), + cancel, } } - /// Future that completes when we need to shut down the connection. - /// - /// We currently need to shut down when any of the following happens: - /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// 2. task_mgr requests shutdown of the connection - /// - /// NB on (1): the connection's lifecycle is not actually tied to any of the - /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current - /// implementation to be responsive to timeline cancellation because - /// the connection holds their `GateGuards` open (sored in `shard_timelines`). - /// We currently do the easy thing and terminate the connection if any of the - /// shard_timelines gets cancelled. But really, we cuold spend more effort - /// and simply remove the cancelled timeline from the `shard_timelines`, thereby - /// dropping the guard. - /// - /// NB: keep in sync with [`Self::is_connection_cancelled`] - async fn await_connection_cancelled(&self) { - // A short wait before we expend the cycles to walk our timeline map. This avoids incurring - // that cost every time we check for cancellation. - tokio::time::sleep(Duration::from_millis(10)).await; - - // This function is never called concurrently with code that adds timelines to shard_timelines, - // which is enforced by the borrow checker (the future returned by this function carries the - // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk - // missing any inserts to the map. 
- - let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); - use futures::future::Either; - cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); - cancellation_sources.extend( - self.shard_timelines - .values() - .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), - ); - FuturesUnordered::from_iter(cancellation_sources) - .next() - .await; - } - - /// Checking variant of [`Self::await_connection_cancelled`]. - fn is_connection_cancelled(&self) -> bool { - task_mgr::is_shutdown_requested() - || self - .shard_timelines - .values() - .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping()) - } - /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect /// cancellation if there aren't any timelines in the cache. @@ -400,15 +534,21 @@ impl PageServerHandler { flush_r = pgb.flush() => { Ok(flush_r?) }, - _ = self.await_connection_cancelled() => { - Err(QueryError::Shutdown) - } _ = cancel.cancelled() => { Err(QueryError::Shutdown) } ) } + /// Pagestream sub-protocol handler. + /// + /// It is a simple request-response protocol inside a COPYBOTH session. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -423,27 +563,27 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) - .await?; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; + } + } loop { + // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData) let msg = tokio::select! { biased; - - _ = self.await_connection_cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); + _ = self.cancel.cancelled() => { return Err(QueryError::Shutdown) } - msg = pgb.read_message() => { msg } }; - let copy_data_bytes = match msg? { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, @@ -458,13 +598,12 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); + // parse request let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; - // TODO: We could create a new per-request context here, with unique ID. 
- // Currently we use the same per-timeline context for all requests - - let (response, span) = match neon_fe_msg { + // invoke handler function + let (handler_result, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -518,31 +657,26 @@ impl PageServerHandler { } }; - match response { - Err(PageStreamError::Shutdown) => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - Err(PageStreamError::Reconnect(reason)) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - Err(e) if self.is_connection_cancelled() => { - // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean - // shutdown error, this may be buried inside a PageReconstructError::Other for example. - // - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); - return Err(QueryError::Shutdown); - } - r => { - let response_msg = r.unwrap_or_else(|e| { + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. @@ -553,10 +687,22 @@ impl PageServerHandler { PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) - }); + } + }, + Ok(response_msg) => response_msg, + }; - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + // marshal & transmit response message + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; } } } @@ -644,7 +790,7 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -654,10 +800,16 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); let timeline = self - .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .timeline_handles + .get( + tenant_shard_id.tenant_id, + timeline_id, + ShardSelector::Known(tenant_shard_id.to_index()), + ) .await?; + set_tracing_field_shard_id(&timeline); + let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; let valid_until = lease .valid_until @@ -683,14 +835,17 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -715,7 +870,10 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -723,7 +881,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -748,7 +906,10 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -756,7 +917,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -774,122 +935,6 @@ impl PageServerHandler { })) } - /// For most getpage requests, we will already have a Timeline to serve the request: this function - /// looks up such a Timeline synchronously and without touching any global state. - fn get_cached_timeline_for_page( - &mut self, - req: &PagestreamGetPageRequest, - ) -> Result<&Arc, Key> { - let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { - // Fastest path: single sharded case - if first_idx.shard_count.count() == 1 { - return Ok(&first_timeline.timeline); - } - - let key = rel_block_to_key(req.rel, req.blkno); - let shard_num = first_timeline - .timeline - .get_shard_identity() - .get_shard_number(&key); - - // Fast path: matched the first timeline in our local handler map. This case is common if - // only one shard per tenant is attached to this pageserver. 
- if first_timeline.timeline.get_shard_identity().number == shard_num { - return Ok(&first_timeline.timeline); - } - - let shard_index = ShardIndex { - shard_number: shard_num, - shard_count: first_timeline.timeline.get_shard_identity().count, - }; - - // Fast-ish path: timeline is in the connection handler's local cache - if let Some(found) = self.shard_timelines.get(&shard_index) { - return Ok(&found.timeline); - } - - key - } else { - rel_block_to_key(req.rel, req.blkno) - }; - - Err(key) - } - - /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable - /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`] - /// again. - /// - /// Note that all the Timelines in this cache are for the same timeline_id: they're differ - /// in which shard they belong to. When we serve a getpage@lsn request, we choose a shard - /// based on key. - /// - /// The typical size of this cache is 1, as we generally create shards to distribute work - /// across pageservers, so don't tend to have multiple shards for the same tenant on the - /// same pageserver. - fn cache_timeline( - &mut self, - timeline: Arc, - ) -> Result<&Arc, GetActiveTimelineError> { - let gate_guard = timeline - .gate - .enter() - .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?; - - let shard_index = timeline.tenant_shard_id.to_index(); - let entry = self - .shard_timelines - .entry(shard_index) - .or_insert(HandlerTimeline { - timeline, - _guard: gate_guard, - }); - - Ok(&entry.timeline) - } - - /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with - /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such - /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node). - async fn load_timeline_for_page( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - key: Key, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // Slow path: we must call out to the TenantManager to find the timeline for this Key - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key)) - .await?; - - self.cache_timeline(timeline) - } - - async fn get_timeline_shard_zero( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because - // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable - // ref to salf. So instead, we first build a bool, and then return while not borrowing self. - let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() { - idx.shard_number == ShardNumber(0) - } else { - false - }; - - if have_cached { - let entry = self.shard_timelines.iter().next().unwrap(); - Ok(&entry.1.timeline) - } else { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - Ok(self.cache_timeline(timeline)?) 
- } - } - #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, @@ -898,33 +943,30 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ctx: &RequestContext, ) -> Result { - let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => { - set_tracing_field_shard_id(tl); - tl - } - Err(key) => { - match self - .load_timeline_for_page(tenant_id, timeline_id, key) - .await - { - Ok(t) => t, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - } + let timeline = match self + .timeline_handles + .get( + tenant_id, + timeline_id, + ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)), + ) + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } + Err(e) => return Err(e.into()), }; let _timer = timeline @@ -933,7 +975,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -958,7 +1000,10 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -966,7 +1011,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -987,6 +1032,15 @@ impl PageServerHandler { /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, /// but that is not applicable anymore. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. 
+ /// + /// TODO: wrap the pgb that we pass to the basebackup handler so that it's sensitive + /// to connection cancellation. #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1012,10 +1066,11 @@ impl PageServerHandler { let started = std::time::Instant::now(); - // check that the timeline exists let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. @@ -1037,7 +1092,7 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message_noflush(&BeMessage::CopyOutResponse) .map_err(QueryError::Disconnected)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; + self.flush_cancellable(pgb, &self.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. TODO Compress in that case too (tests need to be updated) @@ -1128,77 +1183,6 @@ impl PageServerHandler { .expect("claims presence already checked"); check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } - - /// Shorthand for getting a reference to a Timeline of an Active tenant. - async fn get_active_tenant_timeline( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { - let tenant = self - .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) - .await - .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant.get_timeline(timeline_id, true)?; - set_tracing_field_shard_id(&timeline); - Ok(timeline) - } - - /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some - /// slots for this tenant are `InProgress` then we will wait. - /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. - /// - /// `timeout` is used as a total timeout for the whole wait operation. - async fn get_active_tenant_with_timeout( - &self, - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - ) -> Result, GetActiveTenantError> { - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is - // for handling the rare case that the slot we're accessing is InProgress. - let tenant_shard = loop { - let resolved = self - .tenant_manager - .resolve_attached_shard(&tenant_id, shard_selector); - match resolved { - ShardResolveResult::Found(tenant_shard) => break tenant_shard, - ShardResolveResult::NotFound => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))); - } - ShardResolveResult::InProgress(barrier) => { - // We can't authoritatively answer right now: wait for InProgress state - // to end, then try again - tokio::select! 
{ - _ = self.await_connection_cancelled() => { - return Err(GetActiveTenantError::Cancelled) - }, - _ = barrier.wait() => { - // The barrier completed: proceed around the loop to try looking up again - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - return Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: timeout, - }); - } - } - } - }; - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant_shard - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant_shard) - } } #[async_trait::async_trait] @@ -1505,7 +1489,7 @@ impl From for QueryError { } #[derive(Debug, thiserror::Error)] -enum GetActiveTimelineError { +pub(crate) enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 48c1851a3a..5d0e963b4e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -386,6 +386,8 @@ impl WalRedoManager { #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { + #[error("Timeline is shutting down")] + ShuttingDown, #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { tenant_id: TenantShardId, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5e1f69f4c1..58f8990892 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -116,8 +116,6 @@ pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. Zero, - /// Pick the first shard we find for the TenantId - First, /// Pick the shard that holds this key Page(Key), /// The shard ID is known: pick the given shard @@ -2088,7 +2086,6 @@ impl TenantManager { }; match selector { - ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return ShardResolveResult::Found(tenant.clone()) } @@ -2170,6 +2167,9 @@ pub(crate) enum GetActiveTenantError { /// never happen. #[error("Tenant is broken: {0}")] Broken(String), + + #[error("reconnect to switch tenant id")] + SwitchedTenant, } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4db44a3a19..ecae443079 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,6 +3,7 @@ pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; +pub(crate) mod handle; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -17,6 +18,7 @@ use camino::Utf8Path; use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ @@ -424,6 +426,8 @@ pub struct Timeline { pub(crate) extra_test_dense_keyspace: ArcSwap, pub(crate) l0_flush_global_state: L0FlushGlobalState, + + pub(crate) handles: handle::PerTimelineState, } pub struct WalReceiverInfo { @@ -1915,6 +1919,9 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // Ensure Prevent new page service requests from starting. + self.handles.shutdown(); + // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) 
self.remote_client.stop(); @@ -2440,6 +2447,8 @@ impl Timeline { extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), l0_flush_global_state: resources.l0_flush_global_state, + + handles: Default::default(), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -3709,6 +3718,17 @@ impl Timeline { &self.shard_identity } + #[inline(always)] + pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: ShardIndex { + shard_number: self.shard_identity.number, + shard_count: self.shard_identity.count, + }, + timeline_id: self.timeline_id, + } + } + /// /// Get a handle to the latest layer for appending. /// diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs new file mode 100644 index 0000000000..e82559b8b3 --- /dev/null +++ b/pageserver/src/tenant/timeline/handle.rs @@ -0,0 +1,967 @@ +//! An efficient way to keep the timeline gate open without preventing +//! timeline shutdown for longer than a single call to a timeline method. +//! +//! # Motivation +//! +//! On a single page service connection, we're typically serving a single TenantTimelineId. +//! +//! Without sharding, there is a single Timeline object to which we dispatch +//! all requests. For example, a getpage request gets dispatched to the +//! Timeline::get method of the Timeline object that represents the +//! (tenant,timeline) of that connection. +//! +//! With sharding, for each request that comes in on the connection, +//! we first have to perform shard routing based on the requested key (=~ page number). +//! The result of shard routing is a Timeline object. +//! We then dispatch the request to that Timeline object. +//! +//! Regardless of whether the tenant is sharded or not, we want to ensure that +//! we hold the Timeline gate open while we're invoking the method on the +//! Timeline object. +//! +//! However, we want to avoid the overhead of entering the gate for every +//! method invocation. +//! +//! Further, for shard routing, we want to avoid calling the tenant manager to +//! resolve the shard for every request. Instead, we want to cache the +//! routing result so we can bypass the tenant manager for all subsequent requests +//! that get routed to that shard. +//! +//! Regardless of how we accomplish the above, it should not +//! prevent the Timeline from shutting down promptly. +//! +//! # Design +//! +//! There are three user-facing data structures: +//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. +//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. +//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! +//! The `Handle` is just a wrapper around an `Arc`. +//! +//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. +//! The `Cache` stores a `Weak` for each cached Timeline. +//! +//! To dispatch a request, the page service connection calls `Cache::get`. +//! +//! A cache miss means we consult the tenant manager for shard routing, +//! resulting in an `Arc`. We enter its gate _once_ and construct an +//! `Arc`. We store a `Weak` in the cache +//! and the `Arc` in the `PerTimelineState`. +//! +//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing +//! and find the `Weak` in the cache. +//! 
We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! +//! The request handler dispatches the request to the right `>::$request_method`. +//! It then drops the `Handle`, which drops the `Arc`. +//! +//! # Memory Management / How The Reference Cycle Is Broken +//! +//! The attentive reader may have noticed the strong reference cycle +//! from `Arc` to `PerTimelineState` to `Arc`. +//! +//! This cycle is intentional: while it exists, the `Cache` can upgrade its +//! `Weak` to an `Arc` in a single atomic operation. +//! +//! The cycle is broken by either +//! - `PerTimelineState::shutdown` or +//! - dropping the `Cache`. +//! +//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! However, since `Handle`s are short-lived and new `Handle`s are not +//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, +//! that extension of the cycle is bounded. +//! +//! # Fast Path for Shard Routing +//! +//! The `Cache` has a fast path for shard routing to avoid calling into +//! the tenant manager for every request. +//! +//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! +//! The current implementation uses the first entry in the hash map +//! to determine the `ShardParameters` and derive the correct +//! `ShardIndex` for the requested key. +//! +//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. +//! +//! If the lookup is successful and the `Weak` can be upgraded, +//! it's a hit. +//! +//! ## Cache invalidation +//! +//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The only reasons why an entry in the cache can become stale are: +//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is +//! being detached, timeline or shard deleted, or pageserver is shutting down. +//! 2. We're doing a shard split and new traffic should be routed to the child shards. +//! +//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! timeline has shut down, and when that happens, we remove the entry from the cache. +//! +//! Regarding (2), the insight is that it is toally fine to keep dispatching requests +//! to the parent shard during a shard split. Eventually, the shard split task will +//! shut down the parent => case (1). + +use std::collections::hash_map; +use std::collections::HashMap; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; + +use pageserver_api::shard::ShardIdentity; +use tracing::instrument; +use tracing::trace; +use utils::id::TimelineId; +use utils::shard::ShardIndex; +use utils::shard::ShardNumber; + +use crate::tenant::mgr::ShardSelector; + +/// The requirement for Debug is so that #[derive(Debug)] works in some places. +pub(crate) trait Types: Sized + std::fmt::Debug { + type TenantManagerError: Sized + std::fmt::Debug; + type TenantManager: TenantManager + Sized; + type Timeline: ArcTimeline + Sized; +} + +/// Uniquely identifies a [`Cache`] instance over the lifetime of the process. +/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`]. +/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer. 
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +struct CacheId(u64); + +impl CacheId { + fn next() -> Self { + static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("CacheId::new() returned 0, overflow"); + } + Self(id) + } +} + +/// See module-level comment. +pub(crate) struct Cache { + id: CacheId, + map: Map, +} + +type Map = HashMap>>; + +impl Default for Cache { + fn default() -> Self { + Self { + id: CacheId::next(), + map: Default::default(), + } + } +} + +#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)] +pub(crate) struct ShardTimelineId { + pub(crate) shard_index: ShardIndex, + pub(crate) timeline_id: TimelineId, +} + +/// See module-level comment. +pub(crate) struct Handle(Arc>); +struct HandleInner { + shut_down: AtomicBool, + timeline: T::Timeline, + // The timeline's gate held open. + _gate_guard: utils::sync::gate::GateGuard, +} + +/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. +/// +/// See module-level comment for details. +pub struct PerTimelineState { + // None = shutting down + handles: Mutex>>>>, +} + +impl Default for PerTimelineState { + fn default() -> Self { + Self { + handles: Mutex::new(Some(Default::default())), + } + } +} + +/// Abstract view of [`crate::tenant::mgr`], for testability. +pub(crate) trait TenantManager { + /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`]. + /// Errors are returned as [`GetError::TenantManager`]. + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result; +} + +/// Abstract view of an [`Arc`], for testability. +pub(crate) trait ArcTimeline: Clone { + fn gate(&self) -> &utils::sync::gate::Gate; + fn shard_timeline_id(&self) -> ShardTimelineId; + fn get_shard_identity(&self) -> &ShardIdentity; + fn per_timeline_state(&self) -> &PerTimelineState; +} + +/// Errors returned by [`Cache::get`]. +#[derive(Debug)] +pub(crate) enum GetError { + TenantManager(T::TenantManagerError), + TimelineGateClosed, + PerTimelineStateShutDown, +} + +/// Internal type used in [`Cache::get`]. +enum RoutingResult { + FastPath(Handle), + SlowPath(ShardTimelineId), + NeedConsultTenantManager, +} + +impl Cache { + /// See module-level comment for details. + /// + /// Does NOT check for the shutdown state of [`Types::Timeline`]. + /// Instead, the methods of [`Types::Timeline`] that are invoked through + /// the [`Handle`] are responsible for checking these conditions + /// and if so, return an error that causes the page service to + /// close the connection. + #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + // terminates because each iteration removes an element from the map + loop { + let handle = self + .get_impl(timeline_id, shard_selector, tenant_manager) + .await?; + if handle.0.shut_down.load(Ordering::Relaxed) { + let removed = self + .map + .remove(&handle.0.timeline.shard_timeline_id()) + .expect("invariant of get_impl is that the returned handle is in the map"); + assert!( + Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), + "shard_timeline_id() incorrect?" 
+ ); + } else { + return Ok(handle); + } + } + } + + #[instrument(level = "trace", skip_all)] + async fn get_impl( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + let miss: ShardSelector = { + let routing_state = self.shard_routing(timeline_id, shard_selector); + match routing_state { + RoutingResult::FastPath(handle) => return Ok(handle), + RoutingResult::SlowPath(key) => match self.map.get(&key) { + Some(cached) => match cached.upgrade() { + Some(upgraded) => return Ok(Handle(upgraded)), + None => { + trace!("handle cache stale"); + self.map.remove(&key).unwrap(); + ShardSelector::Known(key.shard_index) + } + }, + None => ShardSelector::Known(key.shard_index), + }, + RoutingResult::NeedConsultTenantManager => shard_selector, + } + }; + self.get_miss(timeline_id, miss, tenant_manager).await + } + + #[inline(always)] + fn shard_routing( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> RoutingResult { + loop { + // terminates because when every iteration we remove an element from the map + let Some((first_key, first_handle)) = self.map.iter().next() else { + return RoutingResult::NeedConsultTenantManager; + }; + let Some(first_handle) = first_handle.upgrade() else { + // TODO: dedup with get() + trace!("handle cache stale"); + let first_key_owned = *first_key; + self.map.remove(&first_key_owned).unwrap(); + continue; + }; + + let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let make_shard_index = |shard_num: ShardNumber| ShardIndex { + shard_number: shard_num, + shard_count: first_handle_shard_identity.count, + }; + + let need_idx = match shard_selector { + ShardSelector::Page(key) => { + make_shard_index(first_handle_shard_identity.get_shard_number(&key)) + } + ShardSelector::Zero => make_shard_index(ShardNumber(0)), + ShardSelector::Known(shard_idx) => shard_idx, + }; + let need_shard_timeline_id = ShardTimelineId { + shard_index: need_idx, + timeline_id, + }; + let first_handle_shard_timeline_id = ShardTimelineId { + shard_index: first_handle_shard_identity.shard_index(), + timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + }; + + if need_shard_timeline_id == first_handle_shard_timeline_id { + return RoutingResult::FastPath(Handle(first_handle)); + } else { + return RoutingResult::SlowPath(need_shard_timeline_id); + } + } + } + + #[instrument(level = "trace", skip_all)] + #[inline(always)] + async fn get_miss( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + match tenant_manager.resolve(timeline_id, shard_selector).await { + Ok(timeline) => { + let key = timeline.shard_timeline_id(); + match &shard_selector { + ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), + ShardSelector::Page(_) => (), // gotta trust tenant_manager + ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), + } + + let gate_guard = match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }; + trace!("creating new HandleInner"); + let handle = Arc::new( + // TODO: global metric that keeps track of the number of live HandlerTimeline instances + // so we can identify reference cycle bugs. 
+ HandleInner { + shut_down: AtomicBool::new(false), + _gate_guard: gate_guard, + timeline: timeline.clone(), + }, + ); + let handle = { + let mut lock_guard = timeline + .per_timeline_state() + .handles + .lock() + .expect("mutex poisoned"); + match &mut *lock_guard { + Some(per_timeline_state) => { + let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + assert!(replaced.is_none(), "some earlier code left a stale handle"); + match self.map.entry(key) { + hash_map::Entry::Occupied(_o) => { + // This cannot not happen because + // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and + // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle + // while we were waiting for the tenant manager. + unreachable!() + } + hash_map::Entry::Vacant(v) => { + v.insert(Arc::downgrade(&handle)); + handle + } + } + } + None => { + return Err(GetError::PerTimelineStateShutDown); + } + } + }; + Ok(Handle(handle)) + } + Err(e) => Err(GetError::TenantManager(e)), + } + } +} + +impl PerTimelineState { + /// After this method returns, [`Cache::get`] will never again return a [`Handle`] + /// to the [`Types::Timeline`] that embeds this per-timeline state. + /// Even if [`TenantManager::resolve`] would still resolve to it. + /// + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// That's ok because they're short-lived. See module-level comment for details. + #[instrument(level = "trace", skip_all)] + pub(super) fn shutdown(&self) { + let handles = self + .handles + .lock() + .expect("mutex poisoned") + // NB: this .take() sets locked to None. + // That's what makes future `Cache::get` misses fail. + // Cache hits are taken care of below. + .take(); + let Some(handles) = handles else { + trace!("already shut down"); + return; + }; + for handle in handles.values() { + // Make hits fail. + handle.shut_down.store(true, Ordering::Relaxed); + } + drop(handles); + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.0.timeline + } +} + +#[cfg(test)] +impl Drop for HandleInner { + fn drop(&mut self) { + trace!("HandleInner dropped"); + } +} + +// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. +impl Drop for Cache { + fn drop(&mut self) { + for (_, weak) in self.map.drain() { + if let Some(strong) = weak.upgrade() { + // handle is still being kept alive in PerTimelineState + let timeline = strong.timeline.per_timeline_state(); + let mut handles = timeline.handles.lock().expect("mutex poisoned"); + if let Some(handles) = &mut *handles { + let Some(removed) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. 
+ continue; + }; + assert!(Arc::ptr_eq(&removed, &strong)); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use pageserver_api::{ + key::{rel_block_to_key, Key, DBDIR_KEY}, + models::ShardParameters, + reltag::RelTag, + shard::ShardStripeSize, + }; + use utils::shard::ShardCount; + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[derive(Debug)] + struct TestTypes; + impl Types for TestTypes { + type TenantManagerError = anyhow::Error; + type TenantManager = StubManager; + type Timeline = Arc; + } + + struct StubManager { + shards: Vec>, + } + + struct StubTimeline { + gate: utils::sync::gate::Gate, + id: TimelineId, + shard: ShardIdentity, + per_timeline_state: PerTimelineState, + myself: Weak, + } + + impl StubTimeline { + fn getpage(&self) { + // do nothing + } + } + + impl ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: self.shard.shard_index(), + timeline_id: self.id, + } + } + + fn get_shard_identity(&self) -> &ShardIdentity { + &self.shard + } + + fn per_timeline_state(&self) -> &PerTimelineState { + &self.per_timeline_state + } + } + + impl TenantManager for StubManager { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> anyhow::Result> { + for timeline in &self.shards { + if timeline.id == timeline_id { + match &shard_selector { + ShardSelector::Zero if timeline.shard.is_shard_zero() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Zero => continue, + ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Page(_) => continue, + ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Known(_) => continue, + } + } + } + anyhow::bail!("not found") + } + } + + #[tokio::test(start_paused = true)] + async fn test_timeline_shutdown() { + crate::tenant::harness::setup_logging(); + + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // + // fill the cache + // + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + let handle: Handle<_> = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + let handle_inner_weak = Arc::downgrade(&handle.0); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + assert_eq!( + ( + Weak::strong_count(&handle_inner_weak), + Weak::weak_count(&handle_inner_weak) + ), + (2, 2), + "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" + ); + assert_eq!(cache.map.len(), 1); + + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + drop(handle); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + + // + // demonstrate that Handle holds up gate closure + // but shutdown prevents new handles from being handed 
out + // + + tokio::select! { + _ = shard0.gate.close() => { + panic!("cache and per-timeline handler state keep cache open"); + } + _ = tokio::time::sleep(FOREVER) => { + // NB: first poll of close() makes it enter closing state + } + } + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + // SHUTDOWN + shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown + + assert_eq!( + 1, + Weak::strong_count(&handle_inner_weak), + "through local var handle" + ); + assert_eq!( + cache.map.len(), + 1, + "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(via handle), shard0, mgr; weak: myself" + ); + + // this handle is perfectly usable + handle.getpage(); + + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle"); + assert_eq!( + cache.map.len(), + 0, + "first access after shutdown cleans up the Weak's from the cache" + ); + + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + drop(handle); + assert_eq!( + 0, + Weak::strong_count(&handle_inner_weak), + "the HandleInner destructor already ran" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + // closing gate succeeds after dropping handle + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + + // map gets cleaned on next lookup + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 0); + + // ensure all refs to shard0 are gone and we're not leaking anything + let myself = Weak::clone(&shard0.myself); + drop(shard0); + drop(mgr); + assert_eq!(Weak::strong_count(&myself), 0); + } + + #[tokio::test] + async fn test_multiple_timelines_and_deletion() { + crate::tenant::harness::setup_logging(); + + let timeline_a = TimelineId::generate(); + let timeline_b = TimelineId::generate(); + assert_ne!(timeline_a, timeline_b); + let timeline_a = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_a, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let timeline_b = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_b, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mut mgr = StubManager { + shards: vec![timeline_a.clone(), timeline_b.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + assert_eq!(cache.map.len(), 2); + + // delete timeline A + timeline_a.per_timeline_state.shutdown(); + mgr.shards.retain(|t| t.id != timeline_a.id); + assert!( + mgr.resolve(timeline_a.id, 
ShardSelector::Page(key)) + .await + .is_err(), + "broken StubManager implementation" + ); + + assert_eq!( + cache.map.len(), + 2, + "cache still has a Weak handle to Timeline A" + ); + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); + + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we still have it"); + } + + fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + rel_block_to_key( + RelTag { + spcnode: 1663, + dbnode: 208101, + relnode: 2620, + forknum: 0, + }, + shard.0 as u32 * params.stripe_size.0, + ) + } + + #[tokio::test(start_paused = true)] + async fn test_shard_split() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let parent = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_params = ShardParameters { + count: ShardCount(2), + stripe_size: ShardStripeSize::default(), + }; + let child0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child1 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_shards_by_shard_number = [child0.clone(), child1.clone()]; + + let mut cache = Cache::::default(); + + // fill the cache with the parent + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent first" + ); + drop(handle); + } + + // + // SHARD SPLIT: tenant manager changes, but the cache isn't informed + // + + // while we haven't shut down the parent, the cache will return the cached parent, even + // if the tenant manager returns the child + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![], // doesn't matter what's in here, the cache is fully loaded + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent" + ); + drop(handle); + } + + let parent_handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself)); + + // invalidate the cache + parent.per_timeline_state.shutdown(); + + // the cache will now return the child, even though the parent handle still exists + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to 
previous loop + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq( + &handle.myself, + &child_shards_by_shard_number[i as usize].myself + ), + "mgr returns child" + ); + drop(handle); + } + + // all the while the parent handle kept the parent gate open + tokio::select! { + _ = parent_handle.gate.close() => { + panic!("parent handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + drop(parent_handle); + tokio::select! { + _ = parent.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("parent handle is dropped, no other gate holders exist") + } + } + } + + #[tokio::test(start_paused = true)] + async fn test_connection_handler_exit() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + // Simulate 10 connections that's opened, used, and closed + let mut used_handles = vec![]; + for _ in 0..10 { + let mut cache = Cache::::default(); + let handle = { + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + handle + }; + handle.getpage(); + used_handles.push(Arc::downgrade(&handle.0)); + } + + // No handles exist, thus gates are closed and don't require shutdown + assert!(used_handles + .iter() + .all(|weak| Weak::strong_count(weak) == 0)); + + // ... thus the gate should close immediately, even without shutdown + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + } +} From a006f7656e29223278523c7712c298dfbdee9efa Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 25 Jul 2024 15:45:15 -0500 Subject: [PATCH 310/412] Fix negative replication delay metric In some cases, we can get a negative metric for replication_delay_bytes. My best guess from all the research I've done is that we evaluate pg_last_wal_receive_lsn() before pg_last_wal_replay_lsn(), and that by the time everything is said and done, the replay LSN has advanced past the receive LSN. In this case, our lag can effectively be modeled as 0 due to the speed of the WAL reception and replay. --- vm-image-spec.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 2767710bad..7d005c7139 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -277,8 +277,12 @@ files: help: 'Bytes between received and replayed LSN' key_labels: values: [replication_delay_bytes] + # We use a GREATEST call here because this calculation can be negative. + # The calculation is not atomic, meaning after we've gotten the receive + # LSN, the replay LSN may have advanced past the receive LSN we + # are using for the calculation. 
query: | - SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - metric_name: replication_delay_seconds type: gauge From 6c1bbe8434c299b2da588f3d26a40dbae9de11e8 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 31 Jul 2024 19:42:10 +0300 Subject: [PATCH 311/412] cicd: change Azure storage details [2/2] (#8562) Change Azure storage configuration to point to updated variables/secrets. Also update subscription id variable. --- .github/actionlint.yml | 1 - .github/workflows/_build-and-test-locally.yml | 6 +++--- .github/workflows/build_and_test.yml | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index f086008d34..37983798b7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -9,6 +9,5 @@ self-hosted-runner: - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - - REMOTE_STORAGE_AZURE_CONTAINER_NEW - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 182e96a8ca..a0ed169024 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -223,9 +223,9 @@ jobs: # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV_NEW }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV_NEW }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER_NEW }}" + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c4df98f585..50006dd3d4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -871,7 +871,7 @@ jobs: with: client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - name: Login to ACR if: github.ref_name == 'main' From b3a76d9601b6b4cf45739426bfc53611804a08ed Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 31 Jul 2024 18:37:47 +0100 Subject: [PATCH 312/412] controller: simplify reconciler generation increment logic (#8560) ## Problem This code was confusing, untested and covered: - an impossible case, where intent state is AttacheStale (we never do this) - a rare edge case (going from AttachedMulti to Attached), which we were not testing, and in any case the pageserver internally does the same Tenant reset in this transition as it would do if we incremented generation. 
Closes: https://github.com/neondatabase/neon/issues/8367 ## Summary of changes - Simplify the logic to only skip incrementing the generation if the location already has the expected generation and the exact same mode. --- storage_controller/src/reconciler.rs | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 12dea2c7ef..254fdb364e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -656,11 +656,8 @@ impl Reconciler { // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - // The general case is to increment the generation. However, there are cases - // where this is not necessary: - // - if we are only updating the TenantConf part of the location - // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale) - // and the location was already in the correct generation + // Incrementing generation is the safe general case, but is inefficient for changes that only + // modify some details (e.g. the tenant's config). let increment_generation = match observed { None => true, Some(ObservedStateLocation { conf: None }) => true, @@ -669,18 +666,11 @@ impl Reconciler { }) => { let generations_match = observed.generation == wanted_conf.generation; - use LocationConfigMode::*; - let mode_transition_requires_gen_inc = - match (observed.mode, wanted_conf.mode) { - // Usually the short-lived attachment modes (multi and stale) are only used - // in the case of [`Self::live_migrate`], but it is simple to handle them correctly - // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. - (AttachedSingle, AttachedStale) => false, - (AttachedMulti, AttachedSingle) => false, - (lhs, rhs) => lhs != rhs, - }; - - !generations_match || mode_transition_requires_gen_inc + // We may skip incrementing the generation if the location is already in the expected mode and + // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale), + // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up + // after a restart/crash, so fall back to the universally safe path of incrementing generation. + !generations_match || (observed.mode != wanted_conf.mode) } }; From 46379cd3f213a99b02a3944d45186447eb2bb696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 31 Jul 2024 20:24:42 +0200 Subject: [PATCH 313/412] storage_scrubber: migrate FindGarbage to remote_storage (#8548) Uses the newly added APIs from #8541 named `stream_tenants_generic` and `stream_objects_with_retries` and extends them with `list_objects_with_retries_generic` and `stream_tenant_timelines_generic` to migrate the `find-garbage` command of the scrubber to `GenericRemoteStorage`. 
Part of https://github.com/neondatabase/neon/issues/7547 --- libs/remote_storage/src/lib.rs | 1 + storage_scrubber/src/garbage.rs | 50 ++++++++++------------ storage_scrubber/src/lib.rs | 40 +++++++++++++++++ storage_scrubber/src/metadata_stream.rs | 57 +++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 27 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 794e696769..2c9e298f79 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -144,6 +144,7 @@ impl RemotePath { /// /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The /// NoDelimiter mode will only populate `keys`. +#[derive(Copy, Clone)] pub enum ListingMode { WithDelimiter, NoDelimiter, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 73479c3658..d6a73bf366 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -19,8 +19,8 @@ use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, init_remote_generic, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants}, + init_remote_generic, list_objects_with_retries_generic, + metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; @@ -153,7 +153,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; + let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -179,7 +179,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants(&s3_client, &target); + let tenants = stream_tenants_generic(&remote_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -237,25 +237,26 @@ async fn find_garbage_inner( // Special case: If it's missing in console, check for known bugs that would enable us to conclusively // identify it as purge-able anyway if console_result.is_none() { - let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id) - .await? - .collect::>() - .await; + let timelines = + stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id) + .await? 
+ .collect::>() + .await; if timelines.is_empty() { // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps - let tenant_objects = list_objects_with_retries( - &s3_client, + let tenant_objects = list_objects_with_retries_generic( + &remote_client, + ListingMode::WithDelimiter, &target.tenant_root(&tenant_shard_id), - None, ) .await?; - let object = tenant_objects.contents.as_ref().unwrap().first().unwrap(); - if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") { + let object = tenant_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap()); + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -264,24 +265,18 @@ async fn find_garbage_inner( for timeline_r in timelines { let timeline = timeline_r?; - let timeline_objects = list_objects_with_retries( - &s3_client, + let timeline_objects = list_objects_with_retries_generic( + &remote_client, + ListingMode::WithDelimiter, &target.timeline_root(&timeline), - None, ) .await?; - if timeline_objects - .common_prefixes - .as_ref() - .map(|v| v.len()) - .unwrap_or(0) - > 0 - { + if !timeline_objects.prefixes.is_empty() { // Sub-paths? Unexpected any_non_initdb = true; } else { - let object = timeline_objects.contents.as_ref().unwrap().first().unwrap(); - if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") { + let object = timeline_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("initdb.tar.zst") { tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); } else { any_non_initdb = true; @@ -336,7 +331,8 @@ async fn find_garbage_inner( // Construct a stream of all timelines within active tenants let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok)); - let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t)); + let timelines = + active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t)); let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY); let timelines = timelines.try_flatten(); diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index e0f154def3..152319b731 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -427,6 +427,7 @@ async fn list_objects_with_retries( Err(anyhow!("unreachable unless MAX_RETRIES==0")) } +/// Listing possibly large amounts of keys in a streaming fashion. fn stream_objects_with_retries<'a>( storage_client: &'a GenericRemoteStorage, listing_mode: ListingMode, @@ -465,6 +466,45 @@ fn stream_objects_with_retries<'a>( } } +/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes, +/// use [`stream_objects_with_retries`] instead. 
+async fn list_objects_with_retries_generic( + remote_client: &GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &S3Target, +) -> anyhow::Result { + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + for trial in 0..MAX_RETRIES { + match remote_client + .list(Some(&prefix), listing_mode, None, &cancel) + .await + { + Ok(response) => return Ok(response), + Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } + error!( + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), + ); + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + } + } + } + panic!("MAX_RETRIES is not allowed to be 0"); +} + async fn download_object_with_retries( s3_client: &Client, bucket_name: &str, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 91dba3c992..c702c0c312 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -189,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>( }) } +/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered +/// using a listing. The listing is done before the stream is built, so that this +/// function can be used to generate concurrency on a stream using buffer_unordered. +pub async fn stream_tenant_timelines_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { + let mut timeline_ids: Vec> = Vec::new(); + let timelines_target = target.timelines_root(&tenant); + + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + ListingMode::WithDelimiter, + &timelines_target + )); + loop { + tracing::debug!("Listing in {tenant}"); + let fetch_response = match objects_stream.next().await { + None => break, + Some(Err(e)) => { + timeline_ids.push(Err(e)); + break; + } + Some(Ok(r)) => r, + }; + + let new_entry_ids = fetch_response + .prefixes + .iter() + .filter_map(|prefix| -> Option<&str> { + prefix + .get_path() + .as_str() + .strip_prefix(&timelines_target.prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + entry_id_str + .parse::() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + timeline_ids.push(i); + } + } + + tracing::debug!("Yielding for {}", tenant); + Ok(stream! 
{ + for i in timeline_ids { + let id = i?; + yield Ok(TenantShardTimelineId::new(tenant, id)); + } + }) +} + pub(crate) fn stream_listing<'a>( s3_client: &'a Client, target: &'a S3Target, From 8adc4031d0558d4c7390aacfeba9d1f91f0e41df Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 31 Jul 2024 19:47:59 +0100 Subject: [PATCH 314/412] CI(create-test-report): fix missing benchmark results in Allure report (#8540) ## Problem In https://github.com/neondatabase/neon/pull/8241 I've accidentally removed `create-test-report` dependency on `benchmarks` job ## Summary of changes - Run `create-test-report` after `benchmarks` job --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 50006dd3d4..c7ae2aedd4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -309,7 +309,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} From 02e8fd0b527dbefb9e8fd7528d9f4cbff56e6cf0 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 31 Jul 2024 17:55:19 -0400 Subject: [PATCH 315/412] test(pageserver): add test_gc_feedback_with_snapshots (#8474) should be working after https://github.com/neondatabase/neon/pull/8328 gets merged. Part of https://github.com/neondatabase/neon/issues/8002 adds a new perf benchmark case that ensures garbages can be collected with branches --------- Signed-off-by: Alex Chi Z --- scripts/benchmark_durations.py | 1 + test_runner/performance/test_gc_feedback.py | 54 +++++++++++++++------ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 01f34a1b96..4ca433679a 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -67,6 +67,7 @@ FALLBACK_DURATION = { "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 4c326111c2..9861259c16 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -6,21 +6,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -@pytest.mark.timeout(10000) -def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - """ - Test that GC is able to collect all old layers even if them are forming - "stairs" and there are not three delta layers since last image layer. 
- - Information about image layers needed to collect old layers should - be propagated by GC to compaction task which should take in in account - when make a decision which new image layers needs to be created. - - NB: this test demonstrates the problem. The source tree contained the - `gc_feedback` mechanism for about 9 months, but, there were problems - with it and it wasn't enabled at runtime. - This PR removed the code: https://github.com/neondatabase/neon/pull/6863 - """ +def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str): + assert mode == "normal" or mode == "with_snapshots" env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + if mode == "with_snapshots": + if step == n_steps / 2: + env.neon_cli.create_branch("child") max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma log.info(f"Writing layer map to {layer_map_path}") with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + + +@pytest.mark.timeout(10000) +def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + """ + Test that GC is able to collect all old layers even if them are forming + "stairs" and there are not three delta layers since last image layer. + + Information about image layers needed to collect old layers should + be propagated by GC to compaction task which should take in in account + when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. + This PR removed the code: https://github.com/neondatabase/neon/pull/6863 + + And the bottom-most GC-compaction epic resolves the problem. + https://github.com/neondatabase/neon/issues/8002 + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "normal") + + +@pytest.mark.timeout(10000) +def test_gc_feedback_with_snapshots( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle + of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC + horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point, + and images covering the full key range (in a delta layer) at the GC horizon. + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots") From 1e6a1ac9fa4a1cfbbffdbe93f69f45fd666f3c69 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 1 Aug 2024 07:57:09 +0200 Subject: [PATCH 316/412] pageserver: shutdown all walredo managers 8s into shutdown (#8572) # Motivation The working theory for hung systemd during PS deploy (https://github.com/neondatabase/cloud/issues/11387) is that leftover walredo processes trigger a race condition. In https://github.com/neondatabase/neon/pull/8150 I arranged that a clean Tenant shutdown does actually kill its walredo processes. 
But many prod machines don't manage to shut down all their tenants until the 10s systemd timeout hits and, presumably, triggers the race condition in systemd / the Linux kernel that causes the frozen systemd # Solution This PR bolts on a rather ugly mechanism to shut down tenant managers out of order 8s after we've received the SIGTERM from systemd. # Changes - add a global registry of `Weak` - add a special thread spawned during `shutdown_pageserver` that sleeps for 8s, then shuts down all redo managers in the registry and prevents new redo managers from being created - propagate the new failure mode of tenant spawning throughout the code base - make sure shut down tenant manager results in PageReconstructError::Cancelled so that if Timeline::get calls come in after the shutdown, they do the right thing --- pageserver/src/lib.rs | 83 ++++++++++++++++++++++++++++++- pageserver/src/tenant.rs | 82 ++++++++++++++++++++++++------ pageserver/src/tenant/mgr.rs | 38 ++++++++------ pageserver/src/tenant/timeline.rs | 22 +++++--- pageserver/src/walredo.rs | 29 ++++++++--- 5 files changed, 206 insertions(+), 48 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f729cad3c3..5aee13cfc6 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod l0_flush; + +use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; pub mod aux_file; @@ -36,7 +38,7 @@ use tenant::{ mgr::{BackgroundPurges, TenantManager}, secondary, }; -use tracing::info; +use tracing::{info, info_span}; /// Current storage format version /// @@ -85,6 +87,79 @@ pub async fn shutdown_pageserver( exit_code: i32, ) { use std::time::Duration; + + // If the orderly shutdown below takes too long, we still want to make + // sure that all walredo processes are killed and wait()ed on by us, not systemd. + // + // (Leftover walredo processes are the hypothesized trigger for the systemd freezes + // that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387. + // + // We use a thread instead of a tokio task because the background runtime is likely busy + // with the final flushing / uploads. This activity here has priority, and due to lack + // of scheduling priority feature sin the tokio scheduler, using a separate thread is + // an effective priority booster. 
+ let walredo_extraordinary_shutdown_thread_span = { + let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread"); + span.follows_from(tracing::Span::current()); + span + }; + let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new(); + let walredo_extraordinary_shutdown_thread = std::thread::spawn({ + let walredo_extraordinary_shutdown_thread_cancel = + walredo_extraordinary_shutdown_thread_cancel.clone(); + move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let _entered = rt.enter(); + let _entered = walredo_extraordinary_shutdown_thread_span.enter(); + if let Ok(()) = rt.block_on(tokio::time::timeout( + Duration::from_secs(8), + walredo_extraordinary_shutdown_thread_cancel.cancelled(), + )) { + info!("cancellation requested"); + return; + } + let managers = tenant::WALREDO_MANAGERS + .lock() + .unwrap() + // prevents new walredo managers from being inserted + .take() + .expect("only we take()"); + // Use FuturesUnordered to get in queue early for each manager's + // heavier_once_cell semaphore wait list. + // Also, for idle tenants that for some reason haven't + // shut down yet, it's quite likely that we're not going + // to get Poll::Pending once. + let mut futs: FuturesUnordered<_> = managers + .into_iter() + .filter_map(|(_, mgr)| mgr.upgrade()) + .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await }) + .collect(); + info!(count=%futs.len(), "built FuturesUnordered"); + let mut last_log_at = std::time::Instant::now(); + #[derive(Debug, Default)] + struct Results { + initiated: u64, + already: u64, + } + let mut results = Results::default(); + while let Some(we_initiated) = rt.block_on(futs.next()) { + if we_initiated { + results.initiated += 1; + } else { + results.already += 1; + } + if last_log_at.elapsed() > Duration::from_millis(100) { + info!(remaining=%futs.len(), ?results, "progress"); + last_log_at = std::time::Instant::now(); + } + } + info!(?results, "done"); + } + }); + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
let remaining_connections = timed( @@ -160,6 +235,12 @@ pub async fn shutdown_pageserver( Duration::from_secs(1), ) .await; + + info!("cancel & join walredo_extraordinary_shutdown_thread"); + walredo_extraordinary_shutdown_thread_cancel.cancel(); + walredo_extraordinary_shutdown_thread.join().unwrap(); + info!("walredo_extraordinary_shutdown_thread done"); + info!("Shut down successfully completed"); std::process::exit(exit_code); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5d0e963b4e..0f09241d22 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; use std::collections::BTreeMap; use std::fmt; +use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; @@ -312,14 +313,66 @@ impl std::fmt::Debug for Tenant { } pub(crate) enum WalRedoManager { - Prod(PostgresRedoManager), + Prod(WalredoManagerId, PostgresRedoManager), #[cfg(test)] Test(harness::TestRedoManager), } -impl From for WalRedoManager { - fn from(mgr: PostgresRedoManager) -> Self { - Self::Prod(mgr) +#[derive(thiserror::Error, Debug)] +#[error("pageserver is shutting down")] +pub(crate) struct GlobalShutDown; + +impl WalRedoManager { + pub(crate) fn new(mgr: PostgresRedoManager) -> Result, GlobalShutDown> { + let id = WalredoManagerId::next(); + let arc = Arc::new(Self::Prod(id, mgr)); + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + match &mut *guard { + Some(map) => { + map.insert(id, Arc::downgrade(&arc)); + Ok(arc) + } + None => Err(GlobalShutDown), + } + } +} + +impl Drop for WalRedoManager { + fn drop(&mut self) { + match self { + Self::Prod(id, _) => { + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + if let Some(map) = &mut *guard { + map.remove(id).expect("new() registers, drop() unregisters"); + } + } + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } +} + +/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down +/// the walredo processes outside of the regular order. 
+/// +/// This is necessary to work around a systemd bug where it freezes if there are +/// walredo processes left => +#[allow(clippy::type_complexity)] +pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy< + Mutex>>>, +> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new()))); +#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +pub(crate) struct WalredoManagerId(u64); +impl WalredoManagerId { + pub fn next() -> Self { + static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + } + Self(id) } } @@ -331,19 +384,20 @@ impl From for WalRedoManager { } impl WalRedoManager { - pub(crate) async fn shutdown(&self) { + pub(crate) async fn shutdown(&self) -> bool { match self { - Self::Prod(mgr) => mgr.shutdown().await, + Self::Prod(_, mgr) => mgr.shutdown().await, #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager + true } } } pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { - Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), + Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout), #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager @@ -363,7 +417,7 @@ impl WalRedoManager { pg_version: u32, ) -> Result { match self { - Self::Prod(mgr) => { + Self::Prod(_, mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) .await } @@ -377,7 +431,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => Some(m.status()), + WalRedoManager::Prod(_, m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -677,11 +731,9 @@ impl Tenant { init_order: Option, mode: SpawnMode, ctx: &RequestContext, - ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); + ) -> Result, GlobalShutDown> { + let wal_redo_manager = + WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?; let TenantSharedResources { broker_client, @@ -880,7 +932,7 @@ impl Tenant { } .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); - tenant + Ok(tenant) } #[instrument(skip_all)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 58f8990892..b5568d37b5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::PreparedTimelineDetach; -use super::TenantSharedResources; +use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service @@ -665,17 +665,20 @@ pub async fn init_tenant_mgr( let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - )), + LocationMode::Attached(attached_conf) => 
TenantSlot::Attached( + tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + ) + .expect("global shutdown during init_tenant_mgr cannot happen"), + ), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -723,7 +726,7 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> Arc { +) -> Result, GlobalShutDown> { // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode // to avoid impacting prod runtime performance. @@ -1190,7 +1193,10 @@ impl TenantManager { None, spawn_mode, ctx, - ); + ) + .map_err(|_: GlobalShutDown| { + UpsertLocationError::Unavailable(TenantMapError::ShuttingDown) + })?; TenantSlot::Attached(tenant) } @@ -1311,7 +1317,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - ); + )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -2045,7 +2051,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - ); + )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ecae443079..3a7353c138 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -76,6 +76,7 @@ use crate::{ metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, }, + walredo, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -1000,7 +1001,10 @@ impl Timeline { .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); - if cfg!(feature = "testing") && res.is_err() { + if cfg!(feature = "testing") + && res.is_err() + && !matches!(res, Err(PageReconstructError::Cancelled)) + { // it can only be walredo issue use std::fmt::Write; @@ -5466,20 +5470,22 @@ impl Timeline { } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); }; - - let img = match self + let res = self .walredo_mgr .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .await - .context("reconstruct a page image") - { + .await; + let img = match res { Ok(img) => img, - Err(e) => return Err(PageReconstructError::WalRedo(e)), + Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), + Err(walredo::Error::Other(e)) => { + return Err(PageReconstructError::WalRedo( + e.context("reconstruct a page image"), + )) + } }; - Ok(img) } } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 5095beefd7..770081b3b4 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -241,6 +241,9 @@ impl PostgresRedoManager { /// Shut down the WAL redo manager. /// + /// Returns `true` if this call was the one that initiated shutdown. + /// `true` may be observed by no caller if the first caller stops polling. + /// /// After this future completes /// - no redo process is running /// - no new redo process will be spawned @@ -250,22 +253,32 @@ impl PostgresRedoManager { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn shutdown(&self) { + pub async fn shutdown(&self) -> bool { // prevent new processes from being spawned - let permit = match self.redo_process.get_or_init_detached().await { + let maybe_permit = match self.redo_process.get_or_init_detached().await { Ok(guard) => { - let (proc, permit) = guard.take_and_deinit(); - drop(proc); // this just drops the Arc, its refcount may not be zero yet - permit + if matches!(&*guard, ProcessOnceCell::ManagerShutDown) { + None + } else { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + Some(permit) + } } - Err(permit) => permit, + Err(permit) => Some(permit), + }; + let it_was_us = if let Some(permit) = maybe_permit { + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + true + } else { + false }; - self.redo_process - .set(ProcessOnceCell::ManagerShutDown, permit); // wait for ongoing requests to drain and the refcounts of all Arc that // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s // for the underlying process. self.launched_processes.close().await; + it_was_us } /// This type doesn't have its own background task to check for idleness: we From a22361b57b3c831cf17a176c573f11b1debcbc69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 1 Aug 2024 10:22:21 +0200 Subject: [PATCH 317/412] Reduce linux-raw-sys duplication (#8577) Before, we had four versions of linux-raw-sys in our dependency graph: ``` linux-raw-sys@0.1.4 linux-raw-sys@0.3.8 linux-raw-sys@0.4.13 linux-raw-sys@0.6.4 ``` now it's only two: ``` linux-raw-sys@0.4.13 linux-raw-sys@0.6.4 ``` The changes in this PR are minimal. In order to get to its state one only has to update procfs in Cargo.toml to 0.16 and do `cargo update -p tempfile -p is-terminal -p prometheus`. 
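As a quick sanity check (illustrative only, not part of this patch), one way to confirm the deduplication locally is to list duplicate crate versions before and after the bump and see which crates still pull in each remaining `linux-raw-sys` version. The `cargo tree` flags below are standard cargo; the exact version spec is just an example.

```shell
# List every crate that appears in the dependency graph under more than one version
cargo tree --duplicates

# Show which crates still depend on a particular linux-raw-sys version
cargo tree --invert linux-raw-sys@0.4.13

# After bumping procfs in Cargo.toml, refresh only the crates mentioned above
cargo update -p tempfile -p is-terminal -p prometheus
```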
--- Cargo.lock | 128 +++++++++-------------------------------------------- Cargo.toml | 2 +- 2 files changed, 21 insertions(+), 109 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2186d55e9c..e2e9ca3ed8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2710,17 +2710,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.48.0", -] - [[package]] name = "io-uring" version = "0.6.2" @@ -2739,14 +2728,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.7" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", - "io-lifetimes", - "rustix 0.37.25", - "windows-sys 0.48.0", + "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2872,18 +2860,6 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" -[[package]] -name = "linux-raw-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" - -[[package]] -name = "linux-raw-sys" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -3001,7 +2977,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", - "procfs 0.16.0", + "procfs", ] [[package]] @@ -3046,7 +3022,7 @@ dependencies = [ "measured", "measured-process", "once_cell", - "procfs 0.14.2", + "procfs", "prometheus", "rand 0.8.5", "rand_distr", @@ -3593,7 +3569,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs 0.14.2", + "procfs", "rand 0.8.5", "regex", "remote_storage", @@ -4139,21 +4115,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "procfs" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" -dependencies = [ - "bitflags 1.3.2", - "byteorder", - "chrono", - "flate2", - "hex", - "lazy_static", - "rustix 0.36.16", -] - [[package]] name = "procfs" version = "0.16.0" @@ -4161,10 +4122,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ "bitflags 2.4.1", + "chrono", + "flate2", "hex", "lazy_static", "procfs-core", - "rustix 0.38.28", + "rustix", ] [[package]] @@ -4174,14 +4137,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ "bitflags 2.4.1", + "chrono", "hex", ] [[package]] name = "prometheus" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ "cfg-if", "fnv", @@ -4189,7 +4153,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs 0.14.2", + "procfs", "thiserror", ] @@ -4943,34 +4907,6 @@ dependencies = [ "nom", ] -[[package]] -name = "rustix" -version = "0.36.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.37.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", -] - [[package]] name = "rustix" version = "0.38.28" @@ -5973,15 +5909,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.5.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", - "fastrand 1.9.0", - "redox_syscall 0.3.5", - "rustix 0.37.25", - "windows-sys 0.45.0", + "fastrand 2.0.0", + "redox_syscall 0.4.1", + "rustix", + "windows-sys 0.52.0", ] [[package]] @@ -7178,15 +7114,6 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -7205,21 +7132,6 @@ dependencies = [ "windows-targets 0.52.4", ] -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-targets" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index 7749378114..af1c1dfc82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -procfs = "0.14" +procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" From f51dc6a44e3d2ca9fb081f6fc806a80d33bb2e49 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 1 Aug 2024 10:25:35 +0100 Subject: [PATCH 318/412] pageserver: add layer visibility calculation (#8511) ## Problem We recently added a "visibility" state to layers, but nothing initializes it. Part of: - #8398 ## Summary of changes - Add a dependency on `range-set-blaze`, which is used as a fast incrementally updated alternative to KeySpace. 
We could also use this to replace the internals of KeySpaceRandomAccum if we wanted to. Writing a type that does this kind of "BtreeMap & merge overlapping entries" thing isn't super complicated, but no reason to write this ourselves when there's a third party impl available. - Add a function to layermap to calculate visibilities for each layer - Add a function to Timeline to call into layermap and then apply these visibilities to the Layer objects. - Invoke the calculation during startup, after image layer creations, and when removing branches. Branch removal and image layer creation are the two ways that a layer can go from Visible to Covered. - Add unit test & benchmark for the visibility calculation - Expose `pageserver_visible_physical_size` metric, which should always be <= `pageserver_remote_physical_size`. - This metric will feed into the /v1/utilization endpoint later: the visible size indicates how much space we would like to use on this pageserver for this tenant. - When `pageserver_visible_physical_size` is greater than `pageserver_resident_physical_size`, this is a sign that the tenant has long-idle branches, which result in layers that are visible in principle, but not used in practice. This does not keep visibility hints up to date in all cases: particularly, when creating a child timeline, any previously covered layers will not get marked Visible until they are accessed. Updates after image layer creation could be implemented as more of a special case, but this would require more new code: the existing depth calculation code doesn't maintain+yield the list of deltas that would be covered by an image layer. ## Performance This operation is done rarely (at startup and at timeline deletion), so needs to be efficient but not ultra-fast. There is a new `visibility` bench that measures runtime for a synthetic 100k layers case (`sequential`) and a real layer map (`real_map`) with ~26k layers. The benchmark shows runtimes of single digit milliseconds (on a ryzen 7950). This confirms that the runtime shouldn't be a problem at startup (as we already incur S3-level latencies there), but that it's slow enough that we definitely shouldn't call it more often than necessary, and it may be worthwhile to optimize further later (things like: when removing a branch, only bother scanning layers below the branchpoint) ``` visibility/sequential time: [4.5087 ms 4.5894 ms 4.6775 ms] change: [+2.0826% +3.9097% +5.8995%] (p = 0.00 < 0.05) Performance has regressed. Found 24 outliers among 100 measurements (24.00%) 2 (2.00%) high mild 22 (22.00%) high severe min: 0/1696070, max: 93/1C0887F0 visibility/real_map time: [7.0796 ms 7.0832 ms 7.0871 ms] change: [+0.3900% +0.4505% +0.5164%] (p = 0.00 < 0.05) Change within noise threshold. Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe min: 0/1696070, max: 93/1C0887F0 visibility/real_map_many_branches time: [4.5285 ms 4.5355 ms 4.5434 ms] change: [-1.0012% -0.8004% -0.5969%] (p = 0.00 < 0.05) Change within noise threshold. 
``` --- Cargo.lock | 56 ++- pageserver/Cargo.toml | 1 + pageserver/benches/bench_layer_map.rs | 78 ++- pageserver/src/metrics.rs | 15 + pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/layer_map.rs | 474 +++++++++++++++++- .../layer_map/historic_layer_coverage.rs | 4 + pageserver/src/tenant/storage_layer.rs | 41 +- pageserver/src/tenant/storage_layer/layer.rs | 53 +- pageserver/src/tenant/timeline.rs | 28 +- pageserver/src/tenant/timeline/compaction.rs | 39 ++ pageserver/src/tenant/timeline/delete.rs | 9 +- .../indices/mixed_workload/README.md | 7 + .../indices/mixed_workload/index_part.json | 1 + test_runner/fixtures/metrics.py | 1 + 15 files changed, 729 insertions(+), 80 deletions(-) create mode 100644 pageserver/test_data/indices/mixed_workload/README.md create mode 100644 pageserver/test_data/indices/mixed_workload/index_part.json diff --git a/Cargo.lock b/Cargo.lock index e2e9ca3ed8..dc4f0c7b81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1418,7 +1418,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools", + "itertools 0.10.5", "num-traits", "once_cell", "oorandom", @@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -2134,6 +2134,12 @@ dependencies = [ "slab", ] +[[package]] +name = "gen_ops" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a" + [[package]] name = "generic-array" version = "0.14.7" @@ -2746,6 +2752,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -3551,7 +3566,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "leaky-bucket", "md5", "metrics", @@ -3571,6 +3586,7 @@ dependencies = [ "pq_proto", "procfs", "rand 0.8.5", + "range-set-blaze", "regex", "remote_storage", "reqwest 0.12.4", @@ -3621,7 +3637,7 @@ dependencies = [ "hex", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "postgres_ffi", "rand 0.8.5", "serde", @@ -3679,7 +3695,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pageserver_api", @@ -4011,7 +4027,7 @@ name = "postgres_connection" version = "0.1.0" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "once_cell", "postgres", "tokio-postgres", @@ -4069,7 +4085,7 @@ version = "0.1.0" dependencies = [ "byteorder", "bytes", - "itertools", + "itertools 0.10.5", "pin-project-lite", "postgres-protocol", "rand 0.8.5", @@ -4175,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck 0.4.1", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -4196,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -4253,7 +4269,7 @@ dependencies = [ "hyper-util", "indexmap 2.0.1", "ipnet", - "itertools", + "itertools 0.10.5", 
"lasso", "md5", "measured", @@ -4429,6 +4445,18 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "range-set-blaze" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2" +dependencies = [ + "gen_ops", + "itertools 0.12.1", + "num-integer", + "num-traits", +] + [[package]] name = "rayon" version = "1.7.0" @@ -4597,7 +4625,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pin-project-lite", @@ -5666,7 +5694,7 @@ dependencies = [ "hex", "humantime", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "lasso", "measured", "metrics", @@ -5732,7 +5760,7 @@ dependencies = [ "futures-util", "hex", "humantime", - "itertools", + "itertools 0.10.5", "once_cell", "pageserver", "pageserver_api", @@ -7361,7 +7389,7 @@ dependencies = [ "hmac", "hyper 0.14.26", "indexmap 1.9.3", - "itertools", + "itertools 0.10.5", "libc", "log", "memchr", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0d9343d643..43976250a4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -49,6 +49,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true +range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true serde.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1d02aa7709..1353e79f7c 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,3 +1,4 @@ +use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; @@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; + +fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) +} fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::default(); @@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning // between each test run. 
fn bench_from_captest_env(c: &mut Criterion) { // TODO consider compressing this file - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); // Test with uniform query pattern @@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) { fn bench_from_real_project(c: &mut Criterion) { // Init layer map let now = Instant::now(); - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); println!("Finished layer map init in {:?}", now.elapsed()); // Choose uniformly distributed queries @@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) { group.finish(); } +fn bench_visibility_with_map( + group: &mut BenchmarkGroup, + layer_map: LayerMap, + read_points: Vec, + bench_name: &str, +) { + group.bench_function(bench_name, |b| { + b.iter(|| black_box(layer_map.get_visibility(read_points.clone()))); + }); +} + +// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. +fn bench_visibility(c: &mut Criterion) { + let mut group = c.benchmark_group("visibility"); + { + // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. + let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for i in 0..100_000 { + let i32 = (i as u32) % 100; + let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); + let layer = PersistentLayerDesc::new_img( + TenantShardId::unsharded(TenantId::generate()), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + 0, + ); + updates.insert_historic(layer); + } + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); + + let mut read_points = Vec::new(); + for i in (0..100_000).step_by(1000) { + read_points.push(Lsn(i)); + } + + bench_visibility_with_map(&mut group, layer_map, read_points, "sequential"); + } + + { + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![Lsn(0x1C760FA190)]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map"); + + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![ + Lsn(0x1C760FA190), + Lsn(0x000000931BEAD539), + Lsn(0x000000931BF63011), + Lsn(0x000000931B33AE68), + Lsn(0x00000038E67ABFA0), + Lsn(0x000000931B33AE68), + Lsn(0x000000914E3F38F0), + Lsn(0x000000931B33AE68), + ]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches"); + } + + group.finish(); +} + criterion_group!(group_1, bench_from_captest_env); criterion_group!(group_2, bench_from_real_project); criterion_group!(group_3, bench_sequential); -criterion_main!(group_1, group_2, group_3); +criterion_group!(group_4, bench_visibility); +criterion_main!(group_1, group_2, group_3, group_4); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ede6b41a75..cd2cd43f27 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static VISIBLE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_visible_physical_size", + "The size of the layer files present in the pageserver's filesystem.", + 
&["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_resident_physical_size_global", @@ -2204,6 +2213,7 @@ pub(crate) struct TimelineMetrics { pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, + pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, @@ -2326,6 +2336,9 @@ impl TimelineMetrics { let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) @@ -2380,6 +2393,7 @@ impl TimelineMetrics { layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, + visible_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, directory_entries_count_gauge, @@ -2431,6 +2445,7 @@ impl TimelineMetrics { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0f09241d22..b9257dfbe8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1634,7 +1634,7 @@ impl Tenant { self: Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { - DeleteTimelineFlow::run(&self, timeline_id, false).await?; + DeleteTimelineFlow::run(&self, timeline_id).await?; Ok(()) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 6f150a2d5c..ba9c08f6e7 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; @@ -61,7 +62,7 @@ use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; -use super::storage_layer::PersistentLayerDesc; +use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. @@ -871,11 +872,183 @@ impl LayerMap { println!("End dump LayerMap"); Ok(()) } + + /// `read_points` represent the tip of a timeline and any branch points, i.e. the places + /// where we expect to serve reads. + /// + /// This function is O(N) and should be called infrequently. The caller is responsible for + /// looking up and updating the Layer objects for these layer descriptors. 
+ pub fn get_visibility( + &self, + mut read_points: Vec, + ) -> ( + Vec<(Arc, LayerVisibilityHint)>, + KeySpace, + ) { + // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas + // KeySpace is intended to be composed statically and iterated over. + struct KeyShadow { + // Map of range start to range end + inner: RangeSetBlaze, + } + + impl KeyShadow { + fn new() -> Self { + Self { + inner: Default::default(), + } + } + + fn contains(&self, range: Range) -> bool { + let range_incl = range.start.to_i128()..=range.end.to_i128() - 1; + self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint( + CheckSortedDisjoint::from([range_incl]), + )) + } + + /// Add the input range to the keys covered by self. + /// + /// Return true if inserting this range covered some keys that were previously not covered + fn cover(&mut self, insert: Range) -> bool { + let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1; + self.inner.ranges_insert(range_incl) + } + + fn reset(&mut self) { + self.inner = Default::default(); + } + + fn to_keyspace(&self) -> KeySpace { + let mut accum = KeySpaceAccum::new(); + for range_incl in self.inner.ranges() { + let range = Range { + start: Key::from_i128(*range_incl.start()), + end: Key::from_i128(range_incl.end() + 1), + }; + accum.add_range(range) + } + + accum.to_keyspace() + } + } + + // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow, + // and a ReadPoint + read_points.sort_by_key(|rp| rp.0); + let mut shadow = KeyShadow::new(); + + // We will interleave all our read points and layers into a sorted collection + enum Item { + ReadPoint { lsn: Lsn }, + Layer(Arc), + } + + let mut items = Vec::with_capacity(self.historic.len() + read_points.len()); + items.extend(self.iter_historic_layers().map(Item::Layer)); + items.extend( + read_points + .into_iter() + .map(|rp| Item::ReadPoint { lsn: rp }), + ); + + // Ordering: we want to iterate like this: + // 1. Highest LSNs first + // 2. Consider images before deltas if they end at the same LSNs (images cover deltas) + // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible) + items.sort_by_key(|item| { + std::cmp::Reverse(match item { + Item::Layer(layer) => { + if layer.is_delta() { + (Lsn(layer.get_lsn_range().end.0 - 1), 0) + } else { + (layer.image_layer_lsn(), 1) + } + } + Item::ReadPoint { lsn } => (*lsn, 2), + }) + }); + + let mut results = Vec::with_capacity(self.historic.len()); + + let mut maybe_covered_deltas: Vec> = Vec::new(); + + for item in items { + let (reached_lsn, is_readpoint) = match &item { + Item::ReadPoint { lsn } => (lsn, true), + Item::Layer(layer) => (&layer.lsn_range.start, false), + }; + maybe_covered_deltas.retain(|d| { + if *reached_lsn >= d.lsn_range.start && is_readpoint { + // We encountered a readpoint within the delta layer: it is visible + + results.push((d.clone(), LayerVisibilityHint::Visible)); + false + } else if *reached_lsn < d.lsn_range.start { + // We passed the layer's range without encountering a read point: it is not visible + results.push((d.clone(), LayerVisibilityHint::Covered)); + false + } else { + // We're still in the delta layer: continue iterating + true + } + }); + + match item { + Item::ReadPoint { lsn: _lsn } => { + // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have + // to assume that the whole key range is visible at the branch point. 
+ shadow.reset(); + } + Item::Layer(layer) => { + let visibility = if layer.is_delta() { + if shadow.contains(layer.get_key_range()) { + // If a layer isn't visible based on current state, we must defer deciding whether + // it is truly not visible until we have advanced past the delta's range: we might + // encounter another branch point within this delta layer's LSN range. + maybe_covered_deltas.push(layer); + continue; + } else { + LayerVisibilityHint::Visible + } + } else { + let modified = shadow.cover(layer.get_key_range()); + if modified { + // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered + LayerVisibilityHint::Visible + } else { + // An image layer in a region that was already covered + LayerVisibilityHint::Covered + } + }; + + results.push((layer, visibility)); + } + } + } + + // Drain any remaining maybe_covered deltas + results.extend( + maybe_covered_deltas + .into_iter() + .map(|d| (d, LayerVisibilityHint::Covered)), + ); + + (results, shadow.to_keyspace()) + } } #[cfg(test)] mod tests { - use pageserver_api::keyspace::KeySpace; + use crate::tenant::{storage_layer::LayerName, IndexPart}; + use pageserver_api::{ + key::DBDIR_KEY, + keyspace::{KeySpace, KeySpaceRandomAccum}, + }; + use std::{collections::HashMap, path::PathBuf}; + use utils::{ + id::{TenantId, TimelineId}, + shard::TenantShardId, + }; use super::*; @@ -1002,4 +1175,299 @@ mod tests { } } } + + #[test] + fn layer_visibility_basic() { + // A simple synthetic input, as a smoke test. + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + let timeline_id = TimelineId::generate(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + + const FAKE_LAYER_SIZE: u64 = 1024; + + let inject_delta = |updates: &mut BatchedUpdates, + key_start: i128, + key_end: i128, + lsn_start: u64, + lsn_end: u64| { + let desc = PersistentLayerDesc::new_delta( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Range { + start: Lsn(lsn_start), + end: Lsn(lsn_end), + }, + 1024, + ); + updates.insert_historic(desc.clone()); + desc + }; + + let inject_image = + |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| { + let desc = PersistentLayerDesc::new_img( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Lsn(lsn), + FAKE_LAYER_SIZE, + ); + updates.insert_historic(desc.clone()); + desc + }; + + // + // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios + // we expect to handle. You can follow these examples through in the same order as they would be processed + // by the function under test. 
+ // + + let mut read_points = vec![Lsn(1000)]; + + // A delta ahead of any image layer + let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110); + + // An image layer is visible and covers some layers beneath itself + let visible_covering_img = inject_image(&mut updates, 5, 25, 99); + + // A delta layer covered by the image layer: should be covered + let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100); + + // A delta layer partially covered by an image layer: should be visible + let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100); + + // A delta layer not covered by an image layer: should be visible + let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100); + + // An image layer covered by the image layer above: should be covered + let covered_image = inject_image(&mut updates, 10, 20, 89); + + // An image layer partially covered by an image layer: should be visible + let partially_covered_image = inject_image(&mut updates, 1, 7, 89); + + // An image layer not covered by an image layer: should be visible + let not_covered_image = inject_image(&mut updates, 1, 4, 89); + + // A read point: this will make subsequent layers below here visible, even if there are + // more recent layers covering them. + read_points.push(Lsn(80)); + + // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer + let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79); + + // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range: + // the read point should make it visible, even though its end LSN is covered + let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69); + let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69); + read_points.push(Lsn(65)); + let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69); + + let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65); + + updates.flush(); + + let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + assert_eq!( + layer_visibilities.get(&ahead_layer), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&visible_covering_img), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_image), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_below_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covering_img_between_read_points), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_between_read_points), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_intersects_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + 
layer_visibilities.get(&visible_img_after_last_read_point),
+ Some(&LayerVisibilityHint::Visible)
+ );
+
+ // Shadow should include all the images below the last read point
+ let expected_shadow = KeySpace {
+ ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
+ };
+ assert_eq!(shadow, expected_shadow);
+ }
+
+ fn fixture_path(relative: &str) -> PathBuf {
+ PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+ }
+
+ #[test]
+ fn layer_visibility_realistic() {
+ // Load a large example layermap
+ let index_raw = std::fs::read_to_string(fixture_path(
+ "test_data/indices/mixed_workload/index_part.json",
+ ))
+ .unwrap();
+ let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
+
+ let tenant_id = TenantId::generate();
+ let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+ let timeline_id = TimelineId::generate();
+
+ let mut layer_map = LayerMap::default();
+ let mut updates = layer_map.batch_update();
+ for (layer_name, layer_metadata) in index.layer_metadata {
+ let layer_desc = match layer_name {
+ LayerName::Image(layer_name) => PersistentLayerDesc {
+ key_range: layer_name.key_range.clone(),
+ lsn_range: layer_name.lsn_as_range(),
+ tenant_shard_id,
+ timeline_id,
+ is_delta: false,
+ file_size: layer_metadata.file_size,
+ },
+ LayerName::Delta(layer_name) => PersistentLayerDesc {
+ key_range: layer_name.key_range,
+ lsn_range: layer_name.lsn_range,
+ tenant_shard_id,
+ timeline_id,
+ is_delta: true,
+ file_size: layer_metadata.file_size,
+ },
+ };
+ updates.insert_historic(layer_desc);
+ }
+ updates.flush();
+
+ let read_points = vec![index.metadata.disk_consistent_lsn()];
+ let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+ for (layer_desc, visibility) in &layer_visibilities {
+ tracing::info!("{layer_desc:?}: {visibility:?}");
+ eprintln!("{layer_desc:?}: {visibility:?}");
+ }
+
+ // The shadow should be non-empty, since there were some image layers
+ assert!(!shadow.ranges.is_empty());
+
+ // At least some layers should be marked covered
+ assert!(layer_visibilities
+ .iter()
+ .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
+
+ let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+ // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
+ for (layer_desc, visible) in &layer_visibilities {
+ let mut coverage = KeySpaceRandomAccum::new();
+ let mut covered_by = Vec::new();
+
+ for other_layer in layer_map.iter_historic_layers() {
+ if &other_layer == layer_desc {
+ continue;
+ }
+ if !other_layer.is_delta()
+ && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
+ && other_layer.key_range.start <= layer_desc.key_range.end
+ && layer_desc.key_range.start <= other_layer.key_range.end
+ {
+ coverage.add_range(other_layer.get_key_range());
+ covered_by.push((*other_layer).clone());
+ }
+ }
+ let coverage = coverage.to_keyspace();
+
+ let expect_visible = if coverage.ranges.len() == 1
+ && coverage.contains(&layer_desc.key_range.start)
+ && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
+ {
+ LayerVisibilityHint::Covered
+ } else {
+ LayerVisibilityHint::Visible
+ };
+
+ if expect_visible != *visible {
+ eprintln!(
+ "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
+ layer_desc.key_range.start,
+ layer_desc.key_range.end,
+ layer_desc.lsn_range.start,
+ layer_desc.lsn_range.end,
+ layer_desc.is_delta()
+ );
+ if expect_visible == 
LayerVisibilityHint::Covered { + eprintln!("Covered by:"); + for other in covered_by { + eprintln!( + " {}..{} @ {}", + other.get_key_range().start, + other.get_key_range().end, + other.image_layer_lsn() + ); + } + if let Some(range) = coverage.ranges.first() { + eprintln!( + "Total coverage from contributing layers: {}..{}", + range.start, range.end + ); + } else { + eprintln!( + "Total coverage from contributing layers: {:?}", + coverage.ranges + ); + } + } + } + assert_eq!(expect_visible, *visible); + } + + // Sanity: the layer that holds latest data for the DBDIR key should always be visible + // (just using this key as a key that will always exist for any layermap fixture) + let dbdir_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + assert!(matches!( + layer_visibilities.get(&dbdir_layer.layer).unwrap(), + LayerVisibilityHint::Visible + )); + } } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 347490c1ba..136f68bc36 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -521,6 +521,10 @@ impl BufferedHistoricLayerCoverage { Ok(&self.historic_coverage) } + + pub(crate) fn len(&self) -> usize { + self.layers.len() + } } #[test] diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f931341aca..4fd110359b 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -451,20 +451,14 @@ pub enum ValueReconstructResult { /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should /// be used for cache management but not for correctness-critical checks. -#[derive(Default, Debug, Clone, PartialEq, Eq)] -pub(crate) enum LayerVisibilityHint { +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it /// and a readable LSN (the tip of the branch or a child's branch point) Visible, /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates /// a branch or ephemeral endpoint at an LSN below the layer that covers this. - #[allow(unused)] Covered, - /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded - /// in this state. Note that newly written layers may be called Visible immediately, this uninitialized - /// state is for when existing layers are constructed while loading a timeline. 
- #[default] - Uninitialized, } pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); @@ -626,23 +620,30 @@ impl LayerAccessStats { } } - pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - let value = match visibility { - LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, - LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0, - }; - - self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); - } - - pub(crate) fn visibility(&self) -> LayerVisibilityHint { - let read = self.0.load(std::sync::atomic::Ordering::Relaxed); - match (read >> Self::VISIBILITY_SHIFT) & 0x1 { + /// Helper for extracting the visibility hint from the literal value of our inner u64 + fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint { + match (bits >> Self::VISIBILITY_SHIFT) & 0x1 { 1 => LayerVisibilityHint::Visible, 0 => LayerVisibilityHint::Covered, _ => unreachable!(), } } + + /// Returns the old value which has been replaced + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered => 0x0, + }; + + let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + self.decode_visibility(old_bits) + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + self.decode_visibility(read) + } } /// Get a layer descriptor from a layer. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1075feb1d1..5732779e44 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, + ValuesReconstructState, }; use utils::generation::Generation; @@ -246,7 +247,7 @@ impl Layer { &timeline.generation, ); - let layer = LayerInner::new( + LayerInner::new( conf, timeline, local_path, @@ -254,14 +255,7 @@ impl Layer { Some(inner), timeline.generation, timeline.get_shard_index(), - ); - - // Newly created layers are marked visible by default: the usual case is that they were created to be read. 
- layer - .access_stats - .set_visibility(super::LayerVisibilityHint::Visible); - - layer + ) })); let downloaded = resident.expect("just initialized"); @@ -493,6 +487,32 @@ impl Layer { } } } + + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { + let old_visibility = self.access_stats().set_visibility(visibility.clone()); + use LayerVisibilityHint::*; + match (old_visibility, visibility) { + (Visible, Covered) => { + // Subtract this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .sub(self.0.desc.file_size) + } + } + (Covered, Visible) => { + // Add this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + (Covered, Covered) | (Visible, Visible) => { + // no change + } + } + } } /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted. @@ -693,6 +713,13 @@ impl Drop for LayerInner { timeline.metrics.layer_count_image.dec(); timeline.metrics.layer_size_image.sub(self.desc.file_size); } + + if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + timeline + .metrics + .visible_physical_size_gauge + .sub(self.desc.file_size); + } } if !*self.wanted_deleted.get_mut() { @@ -801,6 +828,12 @@ impl LayerInner { timeline.metrics.layer_size_image.add(desc.file_size); } + // New layers are visible by default. This metric is later updated on drop or in set_visibility + timeline + .metrics + .visible_physical_size_gauge + .add(desc.file_size); + LayerInner { conf, debug_str: { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3a7353c138..37ebeded66 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2736,6 +2736,10 @@ impl Timeline { // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. + // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) + drop(guard); // drop write lock, update_layer_visibility will take a read lock. + self.update_layer_visibility().await; + info!( "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size @@ -4677,27 +4681,6 @@ impl Timeline { } } - // The writer.finish() above already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - if !image_layers.is_empty() { - // We use fatal_err() below because the after writer.finish() returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. 
- let timeline_dir = VirtualFile::open( - &self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - } - let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right @@ -4706,6 +4689,9 @@ impl Timeline { drop_wlock(guard); timer.stop_and_record(); + // Creating image layers may have caused some previously visible layers to be covered + self.update_layer_visibility().await; + Ok(image_layers) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5e9ff1c9e4..4fe9bbafab 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -443,6 +443,45 @@ impl Timeline { Ok(()) } + /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is + /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The + /// purpose of the visibility hint is to record which layers need to be available to service reads. + /// + /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers + /// that we know won't be needed for reads. + pub(super) async fn update_layer_visibility(&self) { + let head_lsn = self.get_last_record_lsn(); + + // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas + // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here. + // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that + // they will be subject to L0->L1 compaction in the near future. + let layer_manager = self.layers.read().await; + let layer_map = layer_manager.layer_map(); + + let readable_points = { + let children = self.gc_info.read().unwrap().retain_lsns.clone(); + + let mut readable_points = Vec::with_capacity(children.len() + 1); + for (child_lsn, _child_timeline_id) in &children { + readable_points.push(*child_lsn); + } + readable_points.push(head_lsn); + readable_points + }; + + let (layer_visibility, covered) = layer_map.get_visibility(readable_points); + for (layer_desc, visibility) in layer_visibility { + // FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one + let layer = layer_manager.get_from_desc(&layer_desc); + layer.set_visibility(visibility); + } + + // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can + // avoid assuming that everything at a branch point is visible. + drop(covered); + } + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as /// as Level 1 files. Returns whether the L0 layers are fully compacted. async fn compact_level0( diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ab6a5f20ba..9b2403f899 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -206,11 +206,10 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! 
- #[instrument(skip_all, fields(%inplace))] + #[instrument(skip_all)] pub async fn run( tenant: &Arc, timeline_id: TimelineId, - inplace: bool, ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -235,11 +234,7 @@ impl DeleteTimelineFlow { ))? }); - if inplace { - Self::background(guard, tenant.conf, tenant, &timeline).await? - } else { - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); - } + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); Ok(()) } diff --git a/pageserver/test_data/indices/mixed_workload/README.md b/pageserver/test_data/indices/mixed_workload/README.md new file mode 100644 index 0000000000..724274fcd9 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/README.md @@ -0,0 +1,7 @@ + +# This was captured from one shard of a large tenant in staging. + +# It has a mixture of deltas and image layers, >1000 layers in total. + +# This is suitable for general smoke tests that want an index which is not +# trivially small, but doesn't contain weird/pathological cases. diff --git a/pageserver/test_data/indices/mixed_workload/index_part.json b/pageserver/test_data/indices/mixed_workload/index_part.json new file mode 100644 index 0000000000..cb4bfc4726 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/index_part.json @@ -0,0 +1 @@ +{"version":7,"layer_metadata":{"000000067F00004005000060F300069883DB-000000067F00004005000060F300069D13FA__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A4000-000000067F00004005000060F300039C0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039FC000-000000067F00004005000060F30003A0F066__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000082C0F1-000000067F000040050081DB43000086E169__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000478000-000000067F00004005000060F3000047C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000012C000-000000067F00004005000060F300001F0000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F700019E8000-000000067F00004005000060F700019EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018E0FE6-000000067F00004005000060F3000193A10B__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016E85370000004000-030000000000000000000000000000000002__0000018613F0A050":{"file_size":14172160,"generation":3,"shard":"0008"},"000000067F00004005000060F300034847BD-000000067F00004005000060F300034BD86C__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C80000-000000067F000040050081DB430000C84000__000000BDAFECFC00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000CCBA0-000000067F00004005000060F20100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA4000-000000067F00004005016EA00C0000CE0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00013BC000-000000067F00004005000060FB0001400000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008
"},"000000067F00004005016EA00C0001240000-000000067F00004005016EA00C0001244000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004EC52E9-000000067F00004005000060F30004F1638A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E10000-000000067F000040050081DB430000E14000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000007F0F-000000067F0000400500EB4A480000037E20__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004FE8000-000000067F00004005000060F3000502905D__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000072C000-000000067F000040050081DB430000768000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E3B48F-000000067F00004005000060F30005EF454F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A100000B7E04-030000000000000000000000000000000002__000000E7C2F1B249-000000EBC9213D59":{"file_size":30146560,"generation":2,"shard":"0008"},"000000067F0000400501025D90000009029B-000000067F0000400501025D950100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A10000-000000067F000040050081DB430000A14000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F5105E-000000067F00004005000060F30002F9A0EB__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000187FE22-000000067F000040050081D80C0100000000__00000075E5D2A930":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001E8000-000000067F000040050081DB4300001EC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000184C000-000000067F00004005000060FB000187FE22__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A16504-000000067F00004005000060F30005A57691__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C0000-000000067F00004005000060F100005C821A__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__000001BCB572A4E0":{"file_size":2310144,"generation":17,"shard":"0008"},"000000067F00004005000060F30002214000-000000067F00004005000060F30002264247__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000110000-000000067F0000400500E3A2A10000114000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006864000-000000067F00004005000060F30006868000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D0000-000000067F0000400500DBCED500000D4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274C000-000000067F00004005000060F30002790000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009274AB-030000000000000000000000000000000002__000001935283F9B9-00000196C9018F59":{"file_size":60104704,"generation":11,"shard":"0008"},"
000000067F0000400500C782E4000023D359-000000067F0000400500C782E400002A5E4B__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001780DB7-000000067F00004005000060F700017E1391__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E4000-000000067F000040050081DB4300004F8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C0000-000000067F00004005016EA00C00018C4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300056DC000-000000067F00004005000060F300056E0000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001F14230-000000067F000040050081D80C0100000000__0000018613F0A050":{"file_size":59138048,"generation":3,"shard":"0008"},"000000067F00004005010F9F120000004000-030000000000000000000000000000000002__0000012E77D3BF00":{"file_size":105775104,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D80000-000000067F00004005000060F30002D84000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000122BBF-000000067F00004005000060F7000013B18E__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B10000-000000067F00004005000060F30002B88FF2__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006320C60-000000067F00004005000060F30006349DA2__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079E393-000000067F00004005016EA00C00009BF728__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500F67839000005C000-000000067F0000400500F67839000006AEF4__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7F71A-030000000000000000000000000000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":50880512,"generation":17,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C481-000001BCB572C5D9":{"file_size":24576,"generation":20,"shard":"0008"},"000000067F00004005000060F70001570000-000000067F00004005000060F70001574000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000042C000-000000067F00004005000060F30000478000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C5D9-000001BCB572DFF9":{"file_size":24576,"generation":22,"shard":"0008"},"000000067F00004005000060FB00015FCD31-030000000000000000000000000000000002__000000698F2C3A38":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C841ED-000000067F00004005000060F30005C95225__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B4A119-000000067F00004005000060F30100000000__0000008196C976A1-0000008625CF2891":{"file_size":200990720,"generation":2,"shard":"0008"},"000000067F00004005000060F300019790A2-000000067F00004005000060F300019C2056__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001838000-000000067F00004005000060FB000183C000__00000075E5D2A930":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C00FE1-000000067F00004005000060F30001C0A0A3__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E0000-000000067F00004005000060F300056E4000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BBD532-000000067F00004005000060F80100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":96477184,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F9B026-000000067F00004005000060F30100000000__00000047E31D98D1-0000004C49155071":{"file_size":173834240,"generation":2,"shard":"0008"},"000000067F000040050081DB430000500000-000000067F000040050081DB430000504000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004971675-000000067F00004005000060F300049B26A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003102107-000000067F00004005000060F300031130BC__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A4000-000000067F00004005000060F30004900000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004B8000-000000067F00004005016EA00C00004BC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001A71688-000000067F00004005000060FB0001A8A1CD__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E60000-000000067F00004005000060F30000E64000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023B0FF7-000000067F00004005000060F300024020ED__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003F8000-000000067F00004005016EA00C00003FC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004B2B250-000000067F00004005000060F30004B5431C__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000050000-000000067F00004005000060F700000885C5__000000044854EBD1-00000008B6B51879":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000097168A-030000000000000000000000000000000002__00000028C365FBE1-0000002D2A8E0B81":{"file_size":120299520,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625C000-000000067F00004005000060F30006270000__0000017171761D90":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BA8000-000000067F00004005000060FB0001BC0B44__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003344134-000000067F00004005000060F3000336D193__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B10FFF-000000067F00004005000060F30006B22072__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E34000-000000067F00004005000060F30006E70000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008238C-000000067F00004005000060F60100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A30000
-000000067F00004005000060F70100000000__0000009DF02C1241-000000A173C00489":{"file_size":269688832,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001CE16ED-000000067F000040050081D80C0100000000__0000008DDCD70B68":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B0000-000000067F000040050081DB4300011B4000__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000010C0D1-000000067F0000400500F3A25C000011E137__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000004000-000000067F00004005000060F70000029ED0__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F60000058F73-000000067F00004005000060F60100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C3F636-000000067F00004005016EA00C0001CC74D7__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000101089-000000067F0000400500EB4A48000012798C__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007A8000-000000067F000040050081DB4300007AC000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000010043F-000000067F00004005000060F20100000000__0000000D55A212C9-000000114A805939":{"file_size":182878208,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EAC000-000000067F00004005000060FB0001F14230__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000616F6B2-000000067F00004005000060F300061B8705__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C9E3C4-000000067F00004005000060F30005CCF3C5__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AA0000-000000067F00004005000060F70001AB05CB__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000073C000-000000067F00004005000060F30000775A02__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AE21D-000000067F000040050081DB43000045029C__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B04000-000000067F00004005000060F70001B18000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E74000-000000067F00004005000060F30000E78000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000182C000-000000067F00004005000060F700018871D6__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DE8B45-000000067F00004005000060FB0000DF968A__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E78000-000000067F00004005000060F30000E7C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000140C000-030000000000000000000000000000000002__000000603CA8F2F0":{"file_size":89522176,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011CA1CD-000000067F00004005000060FB00011F2D11__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"00000
0067F00004005016EA00C000144FB4E-000000067F00004005016EA00C00014B79E7__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700015A195C-000000067F00004005000060F80100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC0000-000000067F00004005000060F70000FC4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000012798C-000000067F0000400500EB4A48000013F89B__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE4000-000000067F00004005016EA00C0001D18000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30005FC519A-000000067F00004005000060F30005FE621A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000370000-000000067F00004005016EA00C0000374000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001760000-000000067F00004005016EA00C0001764000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100003A0000-000000067F00004005000060F100003B8214__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B0000-000000067F00004005000060F300006B4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004E1FF6-030000000000000000000000000000000002__000000174479FC18":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F3000502905D-000000067F00004005000060F300050321C0__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB05CB-000000067F00004005000060F70001AB8B97__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000151F7C5-000000067F00004005016EA00C000158F667__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B9C000-000000067F00004005000060F80100000000__000000AFE87558B0":{"file_size":83533824,"generation":2,"shard":"0008"},"000000067F00004005000060F7000141882A-000000067F00004005000060F80100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000018F5CD-000000067F0000400500EB4A48000019F4DD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000196C000-000000067F00004005000060F70001990000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300029C623C-000000067F00004005000060F30100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":81313792,"generation":2,"shard":"0008"},"000000067F00004005000060F300027C0000-000000067F00004005000060F300027C4000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000001487-000000067F0000400500FB3D300100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":24428544,"generation":2,"shard":"0008"},"000000067F00004005000060F300056D8000-000000067F00004005000060F300056DC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C0000-000000067F00004005000060F700003C4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"000
8"},"000000067F00004005000060F3000664E3CA-000000067F00004005000060F30100000000__000001715E483C79-000001751A7D7589":{"file_size":288645120,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004B5AD-000000067F000040050100D04D00000634BB__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000002C000-000000067F0000400500DBCED50000078000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C20000-000000067F00004005016EA00C0000C24000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001B30000-000000067F00004005000060F70001B34000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700009C035C-000000067F00004005000060F80100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":264159232,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B33945-000000067F00004005000060F30100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":155344896,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079FCFA-000000067F00004005016EA00C00007C7B9C__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000218000-000000067F0000400500EB4A48000021C000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1D0DC-000000067F00004005000060F30005D76250__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000149B774-000000067F00004005000060FB00014A42B8__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D0B155-000000067F00004005000060F30003D14206__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300020FC052-000000067F00004005000060F300021050B0__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002268000-000000067F00004005000060F300022B9050__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004FC000-000000067F000040050081DB430000500000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A93B5-000000067F00004005000060F300060C2210__0000016834A3FC91-0000016B49A934C1":{"file_size":263479296,"generation":2,"shard":"0008"},"000000067F00004005000060F3000674C000-000000067F00004005000060F30006798000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007F913A-030000000000000000000000000000000002__000000A5A3F27398":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F4000-030000000000000000000000000000000002__000000E4D847F4E0":{"file_size":103907328,"generation":2,"shard":"0008"},"000000067F00004005000060F70001348000-000000067F00004005000060F70100000000__0000011B632CC319-0000011F1A40FA69":{"file_size":270753792,"generation":2,"shard":"0008"},"000000067F00004005000060F10000030000-000000067F00004005000060F20100000000__000000021DC73119-000000044854EBD1":{"file_size":267771904,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006674000-000000067F00004005000060
F30006690000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000298000-000000067F00004005000060F3000029C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F185D4-000000067F00004005000060F80100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":249135104,"generation":2,"shard":"0008"},"000000067F00004005000060F300049CB712-000000067F00004005000060F30004A048A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700004B1E77-000000067F00004005000060F80100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B00000-000000067F00004005000060F30004B1111A__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D14000-000000067F00004005000060F30006D30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00002D77AE-030000000000000000000000000000000002__000001880F984A29-0000018C496B6DB1":{"file_size":81018880,"generation":11,"shard":"0008"},"000000067F00004005000060F300002D0000-000000067F00004005000060F30000370FD1__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000028000-000000067F0000400500D69D79000002C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002170000-000000067F00004005000060F30002174000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F59017-000000067F00004005000060F30000F91FFF__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006A37A-000000067F00004005000060F60100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002F012-000000067F00004005000060F60100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005614000-000000067F00004005000060F30005688000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300036C8000-000000067F00004005000060F300036F91FE__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF63C-030000000000000000000000000000000002__000001B3E1B95181-000001B6FFE46BC9":{"file_size":64421888,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000057D31-000000067F0000400500EB4A48000008FC41__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F58000-000000067F00004005016EA00C0000F5C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000908000-000000067F000040050081DB43000094A076__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000471200E-000000067F00004005000060F3000474302B__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000403DA-030000000000000000000000000000000002__00000075E5D2A930":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F6
0000079C4E-000000067F00004005000060F60100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F67839000003C000-000000067F0000400500F678390000058000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C80000-000000067F00004005000060FB0001C84000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300047F5138-000000067F00004005000060F3000480620C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B5C09E-000000067F00004005000060F30006BAD108__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410F57-000000067F00004005000060F70001429534__00000122A7BB7B29-0000012694E36301":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B4000-000000067F00004005016EA00C00006E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009605D8-000000067F00004005000060F80100000000__000000923719A971-00000096262826C9":{"file_size":251338752,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C8CD0C-000000067F00004005000060F80100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700012B8000-000000067F00004005000060F80100000000__00000113456156F1-00000117EDA82C11":{"file_size":265781248,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000049C000-000000067F00004005016EA00C00004A8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000C78000-000000067F00004005000060F70000C7C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B4B0BB-000000067F00004005000060F30006B5C09E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001844000-000000067F00004005000060FB0001848000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F0000-000000067F00004005000060F300067F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C80000-000000067F00004005000060F30004C84000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A4C000-000000067F00004005000060F30002A98000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002480000-000000067F00004005000060F30002484000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000306A02D-000000067F00004005000060F30100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":191299584,"generation":2,"shard":"0008"},"000000067F00004005000060F70001510000-000000067F00004005000060F70001514000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BDB15B-000000067F00004005000060F30005C841ED__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E98000-000000067F00004005000060FB0001E9C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300057942F4-000000067F00004005000060F300057DD292__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005
698000-000000067F00004005000060F3000569C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002983166-000000067F00004005000060F3000299C28F__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C24000-000000067F00004005016EA00C0000CA0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300033D7D7C-000000067F00004005000060F30003458D42__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A1C000-000000067F000040050081DB430000A30379__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D93639-000000067F00004005000060F50100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C195-000000067F00004005016EA00C000029C196__000001BA93C39481-000001BCB572A4E1":{"file_size":32768,"generation":17,"shard":"0008"},"000000067F00004005000060F30000A5F9BB-000000067F00004005000060F60100000000__000000321AA80270":{"file_size":81657856,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D84000-000000067F00004005000060F30002D93639__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1C000-000000067F00004005000060F30005D70000__000001684518AF20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010C8000-000000067F000040050081DB4300010E2072__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000058AF5E-000000067F000040050081DB4300005BCFD7__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000034611E-000000067F00004005000060F80100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300000C1095-000000067F00004005000060F60100000000__000000021DC73119-000000044854EBD1":{"file_size":220635136,"generation":2,"shard":"0008"},"000000067F00004005000060FB000183C000-000000067F00004005000060FB0001840000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C8729E-000000067F00004005000060F30006C98340__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005138000-000000067F00004005000060F3000513C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053E30C3-000000067F00004005000060F300053F40CC__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000002C000-000000067F000040050081DB4300000403DA__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004970000-000000067F00004005000060F30004974000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C08000-000000067F00004005000060F30003C0C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000103AD12-000000067F00004005000060FB000104B856__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004AC000-000000067F00004005016EA00C00004B8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000DB7D33-0
00000067F00004005016EA00C0000E47BD2__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001F30000-000000067F00004005000060F30001F34000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C000-030000000000000000000000000000000002__000001180B3FF408":{"file_size":70516736,"generation":2,"shard":"0008"},"000000067F00004005000060F700017405D4-000000067F00004005000060F70001758B92__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300030B0000-000000067F00004005000060F300030C0FE5__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002168000-000000067F00004005000060F3000216C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000046A83-000000067F00004005000060F60100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001368000-000000067F00004005000060FB000136C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000184000-000000067F00004005000060F80100000000__000000174479FC18":{"file_size":93143040,"generation":2,"shard":"0008"},"000000067F00004005000060FB00012A8000-000000067F00004005000060FB0100000000__00000057593D8169-0000005C01565329":{"file_size":273711104,"generation":2,"shard":"0008"},"000000067F00004005000060F700007B0000-000000067F00004005000060F700007D05C8__00000075CC373F31-00000079F2A2F311":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001680B45-000000067F00004005000060FB000169968A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300050CC000-000000067F00004005000060F300050E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__0000018613F0A050":{"file_size":2310144,"generation":3,"shard":"0008"},"000000067F00004005000060F70001B1C000-000000067F00004005000060F70001B30000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F50000-000000067F00004005000060F70000F705D6__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050109CD330100000000-000000067F000040050109FFA2000000C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001FC000-000000067F0000400500EB4A480000200000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000240B12A-000000067F00004005000060F300024440AE__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000008228D-000000067F00004005000060F60100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C000042C000-000000067F00004005016EA00C0000478000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000FF8000-000000067F00004005000060FB0001000B44__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001699
68A-000000067F00004005000060FB00016D21CF__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005F821C-000000067F00004005000060F20100000000__000000636DE92159-000000663565F8C9":{"file_size":149954560,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7C000-000000067F00004005016EA00C0001E03DD8__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F0000400500F678390000058000-000000067F0000400500F67839000005C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003A7E20-000000067F0000400500EB4A4800003BFD31__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001228000-000000067F00004005016EA00C000122C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000F0C0E9-000000067F000040050081DB430000F4E15B__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000758000-000000067F00004005000060F80100000000__0000006DDB29D589-000000722F474369":{"file_size":264781824,"generation":2,"shard":"0008"},"000000067F00004005000060F300068640AF-000000067F00004005000060F3000686D0DE__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000047C000-000000067F00004005016EA00C0000498000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006166575-000000067F00004005000060F3000616F6B2__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B18000-000000067F00004005000060F70001B1C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700016EC000-000000067F00004005000060F70001708000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005CCF3C5-000000067F00004005000060F30005D184F6__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002848000-000000067F00004005000060F3000285901B__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039C0000-000000067F00004005000060F300039C4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002464000-000000067F00004005000060F30002480000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D0000-000000067F00004005016EA00C00011D4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003D44283-000000067F00004005000060F30003D952B0__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480100000000-000000067F0000400500EE16BC0000044000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000533205E-000000067F00004005000060F300053E30C3__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000009A255-000000067F00004005000060F60300000000__0000017CC2FD7288":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B00000-000000067F00004005000060F70001B04000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"
},"000000067F00004005000060F30004958000-000000067F00004005000060F3000495C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000518000-000000067F00004005000060F80100000000__0000004C49155071-0000004F31878919":{"file_size":262373376,"generation":2,"shard":"0008"},"000000067F00004005000060F300064D8000-000000067F00004005000060F3000658113F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000014000-000000067F0000400500FDA1F80000020D42__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000284000-000000067F00004005000060FB00002D4B6A__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CDBB9C-000000067F00004005000060F80100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":148865024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001298000-000000067F00004005016EA00C000129C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001DD8000-000000067F00004005000060FB0001DF0B43__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001220000-000000067F00004005000060F70001224000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002908000-000000067F00004005000060F30002920FA0__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F5C000-000000067F00004005016EA00C0000F90000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001E03DD8-030000000000000000000000000000000002__000001BCB572A4E0":{"file_size":139264,"generation":17,"shard":"0008"},"000000067F00004005000060F30003998000-000000067F00004005000060F3000399C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014E75C6-030000000000000000000000000000000002__000001A931C135B1-000001AC25760149":{"file_size":51486720,"generation":11,"shard":"0008"},"000000067F00004005010660F500000F44CB-000000067F00004005010660F70100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003FC000-000000067F00004005016EA00C0000400000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F30003849093__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B00000-000000067F00004005000060F30006B10FFF__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001541688-000000067F00004005000060FB000154A1CD__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001098000-000000067F00004005000060FB000109C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700011912D4-000000067F00004005000060F80100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A40000-000000067F00004005000060F30002A44000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001448000-000000067F00004005000060F300014B0F7B__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"s
hard":"0008"},"000000067F00004005000060FB0001009688-000000067F00004005000060FB000102A1CE__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A4000-000000067F0000400500EE16BC00001E0000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B58B45-000000067F00004005000060FB0000B6168A__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000AC000-000000067F0000400500D69D7900000BDAF5__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000193A10B-000000067F00004005000060F30100000000__00000075CC373F31-00000079F2A2F311":{"file_size":198148096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A0000-000000067F00004005016EA00C00005A4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700000E0000-000000067F00004005000060F80100000000__0000000D80565628":{"file_size":112009216,"generation":2,"shard":"0008"},"000000067F00004005000060F3000690F2FD-000000067F00004005000060F300069883DB__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004C6B83-000000067F00004005000060F60100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E18000-000000067F00004005000060F30001E50FF3__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B4000-000000067F00004005000060F300043B8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100006C0000-000000067F00004005000060F20100000000__000000722F474369-00000075CC373F31":{"file_size":267665408,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A78000-000000067F00004005000060F70000A7C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011C1688-000000067F00004005000060FB00011CA1CD__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004E8000-000000067F00004005016EA00C00004EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000257A6F-000000067F00004005016EA00C000029F90B__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001590000-000000067F00004005000060FB0001594000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193189A-030000000000000000000000000000000002__000001B3F17FE4E0":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F300027C4000-000000067F00004005000060F30002828000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B40000-000000067F00004005016EA00C0000B44000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006694000-000000067F00004005000060F300066F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015C8000-000000067F00004005000060FB00015CC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B84000-000000067F00004005000060F30003B90000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"00
0000067F00004005000060F30006704000-000000067F00004005000060F30006748000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000793506-030000000000000000000000000000000002__0000002427BD8BD0":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004F1638A-000000067F00004005000060F30100000000__000001440D3D0C69-0000014784964B91":{"file_size":93708288,"generation":2,"shard":"0008"},"000000067F00004005000060F80100000000-000000067F00004005000060FB0000014000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000180000-000000067F00004005000060F70000184000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A2693B-000000067F00004005000060F30004A7F98F__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C71F27-000000067F00004005000060F30002C9AFB8__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300038075AF-000000067F00004005000060F30100000000__000000FF8B261599-000001048B25A8E9":{"file_size":49823744,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000028000-000000067F0000400500DBCED5000002C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004188000-000000067F00004005000060F300041D9101__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30006868000-000000067F00004005000060F50100000000__00000178C5D5D3A8":{"file_size":116645888,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A789A0-000000067F00004005000060F30003AB9907__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000368000-000000067F0000400500EB4A48000036FF11__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300047EC0CA-000000067F00004005000060F300047F5138__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB8B97-000000067F00004005000060F70001AC115C__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D61283-000000067F00004005000060F70000D8985C__000000C462B3C2A9-000000C824C09619":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300011D1111-000000067F00004005000060F3000122A1D5__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001967D34-000000067F00004005016EA00C000197FBD0__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FA2AD3000004D85C-000000067F0000400500FB3D300100000000__0000010D77B487A0":{"file_size":31309824,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005BCFD7-000000067F000040050081DB4300005D704F__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000004000-000000067F00004005000060F100000260F2__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F0000400500EE16BC00000F8000-000000067F0000400500EE16BC000014158C__000000F901689359-000000FCCD5238B1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000921E8A-000000067F00004005000060F601000
00000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":228564992,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001190000-000000067F00004005000060FB0001198B44__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A0000-000000067F00004005000060F300067A4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000200000-000000067F00004005000060F10000204000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0FBB-000000067F00004005000060F3000407201D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000001C000-000000067F00004005000060F3000008228D__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CD7376-030000000000000000000000000000000002__000001B6FFE46BC9-000001BA93C39481":{"file_size":70238208,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000EBC000-000000067F00004005000060FB0000EC8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000293210E-000000067F00004005000060F30002983166__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000151F271-000000067F00004005000060F30100000000__000000636DE92159-000000663565F8C9":{"file_size":41271296,"generation":2,"shard":"0008"},"000000067F00004005000060F30004880000-000000067F00004005000060F30004884000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000518222-000000067F00004005000060F20100000000__0000005413AB3641-00000057593D8169":{"file_size":169492480,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E0000-000000067F00004005016EA00C00003E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000775A02-000000067F00004005000060F60100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000197FBD0-000000067F00004005016EA00C00019C7A6A__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000067114B-000000067F00004005000060F60100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":232669184,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408000-000000067F00004005000060FB000140C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001F8000-000000067F0000400500EB4A4800001FC000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000290000-000000067F0000400500EB4A480000294000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003061089-000000067F00004005000060F3000306A02D__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE4000-000000067F00004005000060F30001CF0197__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E20000-000000067F00004005000060F70000E24000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D0000-000000067F000040050081DB4300001D4000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D184F6-000000067F00004
005000060F30100000000__0000016143292911-00000164DEE06671":{"file_size":200163328,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F4000-000000067F00004005000060F30006700000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A38000-000000067F000040050081DB430000A4A074__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F38000-000000067F00004005000060F30000F59017__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C0C000-000000067F00004005000060FB0000C18000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D34000-000000067F00004005000060F30006D60000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700013E85D1-000000067F00004005000060F70001410BBC__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000538B44-000000067F00004005000060FB0000551689__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410000-000000067F00004005000060F70001414000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300032F1113-000000067F00004005000060F3000330A1C8__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004974000-000000067F00004005000060F3000498DC49__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625EB45-000000067F00004005000060F30006277C61__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700019E8E81-000000067F00004005000060F80100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":246792192,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5730259-000001BCB5732691":{"file_size":24576,"generation":187,"shard":"0008"},"000000067F000040050081DB4300001CC000-000000067F000040050081DB4300001D0000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C00000-000000067F00004005000060F30002C18FAE__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC4000-000000067F00004005000060F70000FCD85E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000107C39B-030000000000000000000000000000000002__0000004C49155071-0000004F31878919":{"file_size":133349376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F90000-000000067F00004005016EA00C0000F94000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000F98000-000000067F00004005016EA00C0000F9C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700019EC000-000000067F00004005000060F80100000000__0000014EDD256548":{"file_size":7421952,"generation":2,"shard":"0008"},"000000067F00004005000060F300069FA3F6-000000067F00004005000060F30006A0B44C__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shar
d":"0008"},"000000067F000040050081DB4300003AC000-000000067F000040050081DB4300003B27DA__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A57691-000000067F00004005000060F30005B00697__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300060CB2C8-000000067F00004005000060F300060D4415__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000495C000-000000067F00004005000060F30004970000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000D1C5F-000000067F0000400500D69D7900000F1B5B__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001358000-030000000000000000000000000000000002__000001A95031E5B8":{"file_size":21110784,"generation":11,"shard":"0008"},"000000067F00004005000060F3000430C000-000000067F00004005000060F30004370000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004904000-000000067F00004005000060F30004958000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000008000-000000067F00004005000060F30000378000__00000186146441F1-0000018624969469":{"file_size":33357824,"generation":6,"shard":"0008"},"000000067F00004005000060F700005C0000-000000067F00004005000060F700005C85CE__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B04000-000000067F00004005016EA00C0000B40000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002920FA0-000000067F00004005000060F3000293210E__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002058000-000000067F00004005000060F30002070F71__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000686D0DE-000000067F00004005000060F3000689E295__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000004000-000000067F0000400500FA2AD30000030000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009BF728-000000067F00004005016EA00C0000A575C7__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30004374000-000000067F00004005000060F300043B0000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F0000-000000067F00004005000060F300051F4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B22072-000000067F00004005000060F30006B4B0BB__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000328FA4E-000000067F00004005000060F50100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000000FEA0-000000067F00004005016EA00C000001FD3E__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000019F4DD-030000000000000000000000000000000002__000000F6661C9241-000000F901689359":{"file_size":59498496,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003EC000-000000067F00004005016EA00C00003F8000__000001936E7
3D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000073C000-000000067F00004005016EA00C000074F43B__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003542BFF-000000067F00004005000060F50100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001771169-000000067F00004005000060F80100000000__000001398B56A519-0000013C9C0E3339":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003B27DA-030000000000000000000000000000000002__0000008DDCD70B68":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000542AFB0-000000067F00004005000060F30005474062__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000057C94F-000000067F00004005000060F80100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300055861F2-000000067F00004005000060F30100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":127393792,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D79136-000000067F00004005000060F30100000000__0000008DBE2855F9-000000923719A971":{"file_size":227958784,"generation":2,"shard":"0008"},"000000067F00004005000060F10000218000-000000067F00004005000060F1000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD4000-000000067F00004005016EA00C0001CE0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300017EC000-000000067F00004005000060F30001886B2A__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001188000-000000067F00004005000060F300011D1111__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000ECC000-000000067F00004005000060FB0000F050F2__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018C0000-000000067F00004005000060F300018E0FE6__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E4000-000000067F00004005016EA00C0000738000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002790000-000000067F00004005000060F30002794000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00001B850B-000000067F0000400500F56D510100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F100001F8000-000000067F00004005000060F100001FC000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000810000-000000067F00004005000060F80100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F00004005000060F100006CBF87-000000067F00004005000060F20100000000__000000A5A3F27398":{"file_size":15851520,"generation":2,"shard":"0008"},"000000067F0000400500F7D2DD0100000000-000000067F0000400500F8E3A50000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010AABC7-000000067F00004005000060F80100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B80000-000000067F00004005000060F30003B84000__000001180B3FF408":{"file_size":134422528,"generation":2,"sh
ard":"0008"},"000000067F000040050081DB430000078000-000000067F000040050081DB4300000AA080__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002618000-000000067F00004005000060F30002680F9D__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A48000-000000067F00004005000060F30002A4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001994000-000000067F00004005000060F700019E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6168A-000000067F00004005000060FB0000B6A1D0__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000147A0EC-000000067F00004005000060FB000148AC30__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000060000-000000067F0000400500EE16BC0000064000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003458D42-000000067F00004005000060F30003481DDB__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E30000-000000067F00004005000060F30006E34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017F8000-000000067F00004005000060F700017FC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C50000-000000067F00004005000060F30004C54000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F80100000000__00000139CF156B58":{"file_size":63463424,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A8E15E-000000067F000040050081DB430000A98000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":265404416,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BAE526-000000067F00004005000060F30004BE7584__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF97B-000000067F00004005016EA00C0001B0FD2A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F60000014000-000000067F00004005000060F60100000000__0000003D2AB09B68":{"file_size":83329024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C1C000-000000067F00004005000060FB0000C70000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005240000-000000067F00004005000060F30005244000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000077C000-000000067F000040050081DB430000790000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D60000-000000067F00004005000060F30006D64000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C54000-000000067F00004005000060F30004C60000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30005688000-000000067F00004005000060F3000568C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard
":"0008"},"000000067F00004005000060F30004370000-000000067F00004005000060F30004374000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F4000-000000067F00004005000060F30005210000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004DDC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001AFD31-000000067F0000400500C782E400001B7C41__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000BB103B-000000067F00004005000060F60000014C3A__0000003579F03331-0000003959DA2DE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D19D030100000000-000000067F0000400500D69D790000024000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000028B253-030000000000000000000000000000000002__0000008196C976A1-0000008625CF2891":{"file_size":151224320,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004E40FFC__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F44EB0100000000-000000067F00004005010F57CB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BCC000-000000067F00004005000060F30003C08000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B80000-000000067F00004005000060F30005B89170__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000135FCAD-000000067F00004005016EA00C000144FB4E__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005010660F500000B0000-000000067F00004005010660F500000B4000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000D31030-000000067F00004005000060F30100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":233791488,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C18FAE-000000067F00004005000060F30002C71F27__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000041FB53-000000067F0000400500EB4A480000447A64__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000048000-000000067F0000400500EE16BC000004C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D0000-000000067F00004005000060FB00009D4000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004365FE-000000067F00004005000060F20100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006BAD108-000000067F00004005000060F30006C0E146__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B4000-000000067F00004005000060F300006E0000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000327C000-000000067F00004005000060F3000328FA4E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B94000-000000067F00004005000060F30003BC8000__000001180B3FF408":{"file_size":134422528,"gene
ration":2,"shard":"0008"},"000000067F00004005000060F30003CB8FCF-000000067F00004005000060F30003CCA0B9__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003EA902F-000000067F00004005000060F30003F72201__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C64000-000000067F00004005000060F30004C80000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000194000-000000067F000040050081DB4300001C8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__0000018613A0DEA9-00000186146441F1":{"file_size":73728,"generation":5,"shard":"0008"},"000000067F00004005000060F300038B5F5B-000000067F00004005000060F300038FF04F__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001C8000-000000067F000040050081DB4300001CC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000137F10-000000067F0000400500C782E40000177E20__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000139C000-000000067F00004005000060FB00013B8000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000447A64-000000067F0000400500EB4A480100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":40550400,"generation":2,"shard":"0008"},"000000067F00004005000060F70000418000-000000067F00004005000060F700004405CF__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000728000-000000067F000040050081DB43000072C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B0F7B-000000067F00004005000060F30100000000__000000601F43CF09-000000636DE92159":{"file_size":83951616,"generation":2,"shard":"0008"},"000000067F00004005000060F30005F3303F-000000067F00004005000060F30005FA40AD__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300012442A9-000000067F00004005000060F3000129D29A__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010B14AB-000000067F000040050081DB430100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014CF88D-000000067F00004005016EA00C00014D7727__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006A0B44C-000000067F00004005000060F30006A7C566__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000062EE46-000000067F00004005000060F20100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE0000-000000067F00004005016EA00C0001CE4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30000250000-000000067F00004005000060F30000254000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050E8000-000000067F00004005000060F300050EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000259F4A3-000000067F00004005000060F30100000000__00000
0AFD23C27B9-000000B2B5C4E8F9":{"file_size":44433408,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A640EA-000000067F000040050081DB430000A8E15E__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003050000-000000067F00004005000060F30003061089__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C0000158000-000000067F0000400500F3A25C000016A065__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A4000-000000067F000040050081DB4300010B14AB__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E0000-000000067F0000400500EE16BC00001E4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055B8000-000000067F00004005000060F300055BC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE4000-000000067F00004005016EA00C0000D30000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003640000-000000067F00004005000060F30003644000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000014F7AC-000000067F0000400500EB4A4800001876BD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD338E-000000067F00004005016EA00C0001CE79E0__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060FB0001530B44-000000067F00004005000060FB0001541688__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031D516C-000000067F00004005000060F30100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":137863168,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019C7A6A-000000067F00004005016EA00C00019F7907__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000E7F7A7-000000067F00004005016EA00C0000F3F647__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300032C0000-000000067F00004005000060F300032F1113__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E0000-000000067F00004005016EA00C00006E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000019EA78-000000067F00004005000060F80100000000__0000001737D88379-0000001B59EEB909":{"file_size":50946048,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B4FBC9-000000067F00004005016EA00C0001BBFA66__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001660000-000000067F00004005000060FB0001680B45__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002BAA1DD-000000067F00004005000060F30100000000__000000C462B3C2A9-000000C824C09619":{"file_size":203554816,"generation":2,"shard":"0008"},"000000067F00004005000060F300049B26A8-000000067F00004005000060F300049CB712__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CCB5CD-000000067F00004005000060F70000CDBB9C__000000BD9A7C56D9-000000C0C9E
B88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EEA075-000000067F000040050081DB430000F0C0E9__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E0000-000000067F00004005000060F300003E8FBC__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C9C000-000000067F00004005000060F30006CA0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C7C000-000000067F00004005000060F70000C8CD0C__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001148000-000000067F00004005000060FB000114C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001232ACF-000000067F00004005000060F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FE8000-000000067F00004005000060F700010105DB__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000355928-000000067F0000400500EB4A480100000000__000000FCD84FE628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700003FE341-000000067F00004005000060F80100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000244D189-000000067F00004005000060F30100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":212566016,"generation":2,"shard":"0008"},"000000067F00004005000060F700003B85C7-000000067F00004005000060F80100000000__0000003579F03331-0000003959DA2DE9":{"file_size":208945152,"generation":2,"shard":"0008"},"000000067F00004005000060F100005A2B80-000000067F00004005000060F20100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB000070C000-000000067F00004005000060FB0000718000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB000180C000-000000067F00004005000060FB0001838000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000044000-000000067F0000400500EE16BC0000048000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10100000000-000000067F00004005000060F10300000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":483328,"generation":2,"shard":"0008"},"000000067F00004005000060F30004EA41A5-000000067F00004005000060F30004EC52E9__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AB9907-000000067F00004005000060F30003AF28CB__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000974000-000000067F00004005000060FB00009D0000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038720A2-000000067F00004005000060F300038A3082__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000452BA1-000000067F000040050081DB4300004C4C1E__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017AA0CE-000000067F00004005000060F30100000000
__0000006DDB29D589-000000722F474369":{"file_size":202719232,"generation":2,"shard":"0008"},"000000067F000040050081DB430000504000-000000067F000040050081DB430000560000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B5431C-000000067F00004005000060F30004B654F6__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C20000-000000067F00004005000060F30000C24000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300028920E4-000000067F00004005000060F30100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":200351744,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004C4C1E-030000000000000000000000000000000002__000000923719A971-00000096262826C9":{"file_size":192356352,"generation":2,"shard":"0008"},"000000067F000040050081DB430000190000-000000067F000040050081DB430000194000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E88000-000000067F000040050081DB430000E8C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000738000-000000067F00004005016EA00C000073C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000578EE6-000000067F000040050081DB43000058AF5E__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C38000-000000067F00004005000060F30001C3C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B7C0EA-030000000000000000000000000000000002__000000B2B5C4E8F9-000000B768469051":{"file_size":133464064,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625B8F0-000000067F00004005000060F30100000000__0000016B49A934C1-0000016E1FBB7B99":{"file_size":139640832,"generation":2,"shard":"0008"},"000000067F00004005000060FB000109C000-000000067F00004005000060FB0001110000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572DFF9-000001BCB5730259":{"file_size":24576,"generation":41,"shard":"0008"},"000000067F00004005000060FB0000AA8000-000000067F00004005000060FB0000AD0B45__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043F8000-000000067F00004005000060F300043FC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003C7C42-000000067F0000400500EB4A48000041FB53__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BA213F-000000067F00004005000060F30005BDB15B__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300063FE10E-000000067F00004005000060F30100000000__0000016E1FBB7B99-000001715E483C79":{"file_size":111067136,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F91FFF-000000067F00004005000060F30000F9B026__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003650000-000000067F00004005000060F30003654000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050A412B-000000067F00004005000060F300050B5199__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shar
d":"0008"},"000000067F00004005016EA00C0001D78000-000000067F00004005016EA00C0001D7C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005016EA00C0001244000-000000067F00004005016EA00C0001298000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100001FC000-000000067F00004005000060F10000200000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA0000-000000067F00004005016EA00C0000CA4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F3000498DC49-000000067F00004005000060F50100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000036EA0-000000067F00004005000060F60100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000928B45-000000067F00004005000060FB000097168A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006854000-000000067F00004005000060F30006858000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C3F5-030000000000000000000000000000000002__00000117EDA82C11-0000011B632CC319":{"file_size":226066432,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A6D1B3-000000067F00004005000060F30100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":117620736,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D2C000-000000067F00004005000060F30002D80000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A31FB6-000000067F00004005000060F30003A3B020__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000160723E-000000067F00004005016EA00C00016570D9__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FB3D310000018000-000000067F0000400500FB3D31000001C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001708000-000000067F00004005000060F7000170C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000283C3E7-000000067F00004005000060F50100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00018F0000-000000067F00004005000060FB0100000000__00000075CC373F31-00000079F2A2F311":{"file_size":268959744,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EC8000-000000067F00004005000060FB0000ECC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F9C000-000000067F00004005016EA00C0000FF0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002680F9D-000000067F00004005000060F3000274A080__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000679C000-000000067F00004005000060F300067A0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000428313F-000000067F00004005000060F300042CC1BD__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00000FFFFFFFF-030000000000000000000000000000000002__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"
},"000000067F00004005000060FB00017D8000-000000067F00004005000060FB00017DC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017FC000-000000067F00004005000060F70001828000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FD317C-000000067F00004005000060F30002FF427D__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001701588-000000067F00004005000060FB00017120CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500000A3000FFFFFFFF-000000067F0000400500000A690000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":40960,"generation":17,"shard":"0008"},"000000067F00004005000060FB0000638B45-030000000000000000000000000000000002__0000001B59EEB909-0000001FFBC01501":{"file_size":252010496,"generation":2,"shard":"0008"},"000000067F000040050081DB430000394000-000000067F000040050081DB4300003A8000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CF0197-000000067F00004005000060F50100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000DFB51-000000067F0000400500EB4A4800000E7A62__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014C000-000000067F00004005000060F70000180000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005948000-000000067F00004005000060F300059790CD__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000853115-000000067F00004005000060F60100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":176136192,"generation":2,"shard":"0008"},"000000067F00004005000060F30004884000-000000067F00004005000060F30004888000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000513C000-000000067F00004005000060F30005160000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000017C000-000000067F0000400500F3A25C00001B850B__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006382F14-000000067F00004005000060F3000638C06D__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000017F02-000000067F0000400500E3A2A100000B7E04__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001000B44-000000067F00004005000060FB0001009688__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790100000000-000000067F0000400500DBCED50000024000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A0000-000000067F000040050081DB4300010A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000310000-000000067F00004005000060FB0000348B45__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000060038-000000067F00004005000060F60100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE0000-000000067F00004005000060F30001CE4000__0000008DDCD70B68":{"file_size":134422528,"gen
eration":2,"shard":"0008"},"000000067F000040050081DB4300000AA080-000000067F000040050081DB4300000D40FF__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000551689-030000000000000000000000000000000002__0000001737D88379-0000001B59EEB909":{"file_size":227418112,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000D90000-000000067F00004005000060FB0100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":272769024,"generation":2,"shard":"0008"},"000000067F00004005000060F300059CC403-000000067F00004005000060F300059F53C6__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F2C000-000000067F00004005000060F30001F30000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000014000-000000067F00004005000060FB0000084772__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F30004B654F6-000000067F00004005000060F30004BAE526__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002450000-000000067F00004005000060F30002454000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A0F066-000000067F00004005000060F50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000032EBE-000000067F00004005000060F60100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001D8000-000000067F00004005000060FB00001DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000670000-000000067F00004005016EA00C0000674000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001344000-000000067F00004005016EA00C0001358000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D30000-000000067F00004005016EA00C0000D34000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000012FE9A-000000067F00004005016EA00C00001F7D38__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000BF0000-000000067F00004005000060F70100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":273809408,"generation":2,"shard":"0008"},"000000067F00004005000060F300005A0000-000000067F00004005000060F3000067114B__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000021C000-000000067F0000400500EB4A480000290000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F3C000-000000067F00004005016EA00C0000F58000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000074F43B-030000000000000000000000000000000002__000001936E73D028":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005010F57CB000000C000-000000067F00004005010F99A50100000000__00000126C3C69FC0":{"file_size":22978560,"generation":2,"shard":"0008"},"000000067F00004005000060F700017E1391-000000067F00004005000060F80100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":232677376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CC74D7-000000067F00004005016EA00C0001CD7376__000001B6FFE46BC9-000001BA93C39481":{"file_size":2
68451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700005C85CE-000000067F00004005000060F700005E8B9D__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FCD352-000000067F00004005000060F30100000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":124788736,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002A5E4B-000000067F0000400500C782E400002CDD5C__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700018871D6-000000067F00004005000060F80100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D252C8-000000067F00004005000060F30100000000__00000117EDA82C11-0000011B632CC319":{"file_size":205963264,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408A62-000000067F00004005000060FB00014195A7__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001B7C41-000000067F0000400500C782E400001C7B51__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000110000-000000067F00004005000060FB0100000000__000000044854EBD1-00000008B6B51879":{"file_size":272613376,"generation":2,"shard":"0008"},"000000067F00004005000060F300004E8000-000000067F00004005000060F60100000000__0000001737D88379-0000001B59EEB909":{"file_size":260579328,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF4000-000000067F00004005000060F30006E30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C84000-030000000000000000000000000000000002__000000BAC0041E18":{"file_size":59998208,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B88FF2-000000067F00004005000060F30002BAA1DD__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000434000-000000067F00004005000060FB00004A0000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DA8000-000000067F00004005000060F30004DAC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E0000-000000067F000040050081DB4300004E4000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E4000-000000067F0000400500EE16BC0000201716__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C440EA-000000067F000040050081DB430000C5E15B__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000BDAF5-000000067F0000400500D69D790100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A9C000-000000067F00004005000060F30002AEED02__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DAC000-000000067F00004005000060F30004DD8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B94000-000000067F00004005000060F70000B98000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002454000-000000067F00004005000060F30002460000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100001059CB-000000067F00004005000060F10000125BF2__000000114A805939-0000
0013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D362CA-000000067F00004005016EA00C0000DB7D33__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001C0A0A3-000000067F00004005000060F30100000000__0000008625CF2891-00000089F4693119":{"file_size":203063296,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F0000-000000067F00004005000060F300066F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001414000-000000067F00004005000060F70001428000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014CC16D-000000067F00004005000060F300014D5280__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000172AC12-030000000000000000000000000000000002__0000006DDB29D589-000000722F474369":{"file_size":186875904,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E4C000-000000067F000040050081DB430000E88000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300063A50CD-000000067F00004005000060F300063FE10E__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005419E9C-000000067F00004005000060F3000542AFB0__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000014158C-030000000000000000000000000000000002__000000F901689359-000000FCCD5238B1":{"file_size":67854336,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015FF3A0-000000067F00004005016EA00C000160723E__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00008E760F-000000067F00004005016EA00C00009274AB__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B98000-000000067F00004005000060F70000B9C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A4000-000000067F00004005000060FB00004E1FF6__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006670000-000000067F00004005000060F30006674000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000185EE9-000000067F00004005000060F7000018E4B6__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000067CA9-030000000000000000000000000000000002__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":29319168,"generation":2,"shard":"0008"},"000000067F0000400500FF2A51000000BFFB-030000000000000000000000000000000002__0000010D77B487A0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A048A8-000000067F00004005000060F30004A1D870__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004BC000-000000067F00004005000060F300004C6B83__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005290FC9-000000067F00004005000060F3000533205E__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031130BC-000000067F00004005000060F300031C40D1__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"00
08"},"000000067F0000400500D19D030000047EE2-000000067F0000400500D19D03000004FDC6__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A44000-000000067F00004005000060F30002A48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DAE2DC-000000067F00004005000060F30003DD734C__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50000014000-000000067F0000400500F8E3A5000004A25C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100002F03E9-000000067F00004005000060F20100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001138000-000000067F00004005000060F80100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":72695808,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E4000-000000067F00004005000060F50100000000__00000159B010F6C0":{"file_size":13393920,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A7C000-000000067F00004005000060F70000ABD9C4__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000CC6E51-030000000000000000000000000000000002__0000003D2AB09B68":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F60000091EFF-000000067F00004005000060F60100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000008FC41-000000067F0000400500EB4A4800000DFB51__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F363B4-000000067F00004005000060F30001F574A6__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD0000-000000067F00004005016EA00C0001CD4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300059B324D-000000067F00004005000060F300059CC403__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002530000-000000067F00004005000060F30002534000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000004B633-000000067F00004005000060F60100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700011E0000-000000067F00004005000060F80100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":262922240,"generation":2,"shard":"0008"},"000000067F00004005000060F30006690000-000000067F00004005000060F30006694000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000100E18-000000067F00004005000060F700001213F2__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000004000-000000067F0000400500FF2A51000000BFFB__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EB8000-000000067F00004005000060FB0000EBC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000674000-000000067F00004005016EA00C00006B0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000EF85D6-000000067F00004005000060F80100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":262897664,"generation":2,"shard":"0008"},"0000000
67F00004005000060F700005E8B9D-000000067F00004005000060F700005F9158__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E40FFC-000000067F00004005000060F30004E7A062__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000037E20-000000067F0000400500EB4A480000057D31__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400501101C0901FFFFFFFF-030000000000000000000000000000000002__0000012E71CF31F9-000001334140FC21":{"file_size":65060864,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B10000-000000067F00004005000060F70100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":272646144,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E104B-000000067F00004005000060F3000570A19E__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300059790CD-000000067F00004005000060F300059AA115__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B54000-000000067F00004005000060F70000B90000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300041D9101-000000067F00004005000060F3000424A099__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000E085E-000000067F00004005000060F70000100E18__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B0000-000000067F00004005000060F300051B4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572A4E1-000001BCB572C329":{"file_size":24576,"generation":17,"shard":"0008"},"000000067F00004005000060F30006D30000-000000067F00004005000060F30006D34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000020D42-000000067F0000400500FDA1F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081D80C0100000000-000000067F000040050081DB430000024000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000235B4-000000067F00004005000060F60100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500C782E400000A0000-000000067F0000400500C782E400000A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002264247-000000067F00004005000060F50100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000302C2D6-000000067F00004005000060F50100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000129C000-000000067F00004005016EA00C0001340000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700016E8000-000000067F00004005000060F700016EC000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023A0000-000000067F00004005000060F300023B0FF7__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F20100000000-000000067F00004005000060F3000000C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard"
:"0008"},"000000067F00004005016EA00C0000374000-000000067F00004005016EA00C00003E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000368000-000000067F00004005000060F80100000000__0000003203FB5749-0000003579F03331":{"file_size":263249920,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006310C9-030000000000000000000000000000000002__0000009A1ABDE921-0000009DF02C1241":{"file_size":208953344,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DC8000-000000067F00004005000060FB0000DE8B45__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000530000-000000067F00004005000060FB0000538B44__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000024000-000000067F000040050081DB430000028000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000488C000-000000067F00004005000060F30004898000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300044D3639-000000067F00004005000060F50100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010450640000000570-000000067F0000400501046F39000000BDD2__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300021050B0-000000067F00004005000060F3000212E160__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700010DD440-000000067F00004005000060F80100000000__000000F309FCDD19-000000F6661C9241":{"file_size":91758592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AD0B45-000000067F00004005000060FB0000AE168A__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000013B18E-000000067F00004005000060F7000014B73D__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001938000-000000067F00004005016EA00C000193FE9D__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400000A4000-000000067F0000400500C782E4000012A71E__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001A40000-000000067F00004005000060F30001A44000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008578D4-000000067F00004005016EA00C00008CF772__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001CC0000-000000067F00004005000060F30001CC4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D20000-000000067F00004005000060F30004D24000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E8000-000000067F00004005016EA00C00003EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300039C4000-000000067F00004005000060F300039F8000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005164000-000000067F00004005000060F300051B0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039F8000-000000067F00004005000060F300039FC000__0000010D77B487A0":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010F46BD-000000067F000040050081DB430100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":113999872,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E630CF-000000067F00004005000060F30100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":171999232,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ACF305-000000067F00004005016EA00C0000ADF1AB__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006748000-000000067F00004005000060F3000674C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F50100000000__00000104BD37F348":{"file_size":11739136,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021C000-000000067F00004005000060F20100000000__0000002427BD8BD0":{"file_size":132448256,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017EC000-000000067F00004005016EA00C00018C0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000025DA3C-000000067F00004005000060F80100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00007F0000-000000067F00004005000060FB0000860B45__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0000-000000067F00004005000060F30003FF4000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E0AD15-000000067F00004005000060FB0000E1B859__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010ADFA80000004000-000000067F00004005010F2BD40100000000__00000126C3C69FC0":{"file_size":13369344,"generation":2,"shard":"0008"},"000000067F00004005000060F30004898000-000000067F00004005000060F3000489C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D2B1B0-000000067F00004005000060F30003D44283__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF4000-000000067F00004005016EA00C0001188000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005010F99A50100000000-000000067F00004005010F9F120000004000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F34000-000000067F00004005000060F30001F38F48__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700018A0000-000000067F00004005000060F700018D85CA__000001440D3D0C69-0000014784964B91":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300029A526C-000000067F00004005000060F300029C623C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017DC000-000000067F00004005000060FB0001808000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000024000-000000067F0000400500DBCED50000028000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000201716-000000067F0000400500EE16C40100000000__0000012A77C1B0B0":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D10000-000000067F00004005000060F30006D14000__000001848D082B20":{"file_size":134422528,"generati
on":2,"shard":"0008"},"000000067F000040050081DB430001064000-000000067F000040050081DB4300010A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001340000-000000067F00004005000060F30001344000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003E98000-000000067F00004005000060F30003EA902F__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C0E146-000000067F00004005000060F30006C8729E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000166C4-000000067F00004005000060F60100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":54165504,"generation":2,"shard":"0008"},"000000067F00004005000060F10000180000-000000067F00004005000060F1000018821A__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193FE9D-000000067F00004005016EA00C0001967D34__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB43000076C000-000000067F000040050081DB430000778000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050321C0-000000067F00004005000060F30005063187__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D4000-000000067F0000400500DBCED500000F0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004B8000-000000067F00004005000060F300004BC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000022C000-000000067F00004005000060FB0000280000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DF968A-000000067F00004005000060FB0000E021D0__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000228000-000000067F00004005000060FB000022C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015D8000-000000067F00004005000060FB00015DC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B89170-000000067F00004005000060F30005BA213F__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B0000-000000067F00004005000060F300043B4000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004F8000-000000067F000040050081DB4300004FC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006860000-000000067F00004005000060F30006864000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000ADA0D0-000000067F00004005000060F30000B0300C__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000000000-000000067F000040050100D04D000004369C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000BB439-030000000000000000000000000000000002__00000104BD37F348":
{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C078FA-000000067F00004005016EA00C0001C0F79A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430000B4A075-000000067F000040050081DB430000B7C0EA__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000117C10C-000000067F00004005000060F50100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E47BD2-000000067F00004005016EA00C0000E67A6E__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30005D23BB5-000000067F00004005000060F50100000000__00000164EA9EC9A8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000336D193-000000067F00004005000060F3000337DCF3__000000E4C63CFA21-000000E7C2F1B249":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F0000-000000067F00004005000060F300001F4000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000084772-030000000000000000000000000000000002__000000027AF9D7D0":{"file_size":147456,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CE79E0-000000067F00004005016EA00C0001D1F87B__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F0000400500EB4A4800FFFFFFFF-000000067F0000400500EB4A480100000000__000000FF8B261599-000001048B25A8E9":{"file_size":1318912,"generation":2,"shard":"0008"},"000000067F00004005000060F70000488000-000000067F00004005000060F7000048C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ADF1AB-000000067F00004005016EA00C0100000000__00000196C9018F59-0000019A2EAFE7A9":{"file_size":282132480,"generation":11,"shard":"0008"},"000000067F00004005000060FB000071C000-000000067F00004005000060FB0000793506__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006850000-000000067F00004005000060F30006854000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000390000-000000067F000040050081DB430000394000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000020C000-000000067F00004005000060F30000250000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001398000-000000067F00004005000060FB000139C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003648000-000000067F00004005000060F3000364C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001C7B51-000000067F0000400500C782E4000023FA62__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001788000-000000067F00004005016EA00C000178C000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000C3A075-000000067F000040050081DB430000C440EA__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036FE561-000000067F00004005000060F300038075AF__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D03000004FDC6-000000067F0000400500D19D030000067CA9__000000DBBF
A87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C00000-000000067F00004005000060FB0000C04000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000282C000-000000067F00004005000060F3000283C3E7__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B0000-000000067F00004005016EA00C00006B4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001789027-000000067F00004005000060F300017AA0CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004558000-000000067F00004005000060F300045C1062__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C08000-000000067F00004005000060FB0000C0C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DCC000-000000067F00004005000060F30006DF0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B221FE-000000067F00004005000060F30004B2B250__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C4000-000000067F00004005016EA00C00018E0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000564000-000000067F000040050081DB430000578000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274A080-000000067F00004005000060F30100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":199057408,"generation":2,"shard":"0008"},"000000067F00004005000060F300046D0EA8-000000067F00004005000060F3000471200E__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001114000-000000067F00004005000060FB0001120000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FEC000-000000067F00004005000060F30003FF0000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000368000-000000067F00004005000060F10100000000__0000003959DA2DE9-0000003D03FCCDB9":{"file_size":269967360,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000012A71E-030000000000000000000000000000000002__000000D037B2DBD0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C98000-000000067F00004005000060F30006C9C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055BC000-000000067F00004005000060F30005610000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F050F2-030000000000000000000000000000000002__00000047F1F2B800":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30002484000-000000067F00004005000060F300024D8000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FE8000-000000067F00004005000060F30003FEC000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000A8000-000000067F0000400500DBCED500000AC000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006C3D76-000000067F00004005000060F80100000000__000000663565F8C9-000000698AF6E809":{"file_size
":139821056,"generation":2,"shard":"0008"},"000000067F00004005000060F30002534000-000000067F00004005000060F3000253B7A3__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000412D27C-000000067F00004005000060F30004156457__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000910000-000000067F00004005000060F700009385D4__0000008DBE2855F9-000000923719A971":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30002510000-000000067F00004005000060F30002514000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002210000-000000067F00004005000060F30002214000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF4000-000000067F00004005000060F30004070000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001BBFA66-000000067F00004005016EA00C0001C078FA__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000424A099-000000067F00004005000060F3000428313F__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036F91FE-000000067F00004005000060F30100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":164118528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000718000-000000067F00004005000060FB000071C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010F44EB000000C000-000000067F00004005010F44EB0100000000__00000126C3C69FC0":{"file_size":70696960,"generation":2,"shard":"0008"},"000000067F00004005000060F30005214000-000000067F00004005000060F30005240000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000A7AF6E-030000000000000000000000000000000002__000000321AA80270":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005063187-000000067F00004005000060F300050A412B__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005E8000-000000067F00004005000060F100005F821C__000000636DE92159-000000663565F8C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300020830BE-000000067F00004005000060F300020FC052__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065BB235-000000067F00004005000060F300065F42B4__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000034000-000000067F0000400500FA2AD3000004D85C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017A8000-000000067F00004005016EA00C00017AC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00008D8000-000000067F00004005000060FB0000928B45__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000798000-000000067F00004005000060F300007C1007__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000040000-000000067F0000400500D19D030000047EE2__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AB1583-000000067F000040
05000060F50100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AD8000-000000067F00004005000060F30001B09104__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E1B859-030000000000000000000000000000000002__000000417D21ACF9-00000044B4679349":{"file_size":156844032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E9C000-000000067F00004005000060FB0001EA8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001374000-000000067F00004005000060FB0001398000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000155C000-000000067F00004005000060FB0001590000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000EA069-000000067F0000400500F3A25C000010C0D1__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000568C000-000000067F00004005000060F30005698000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C74000-000000067F00004005000060FB0000C98000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004F0000-000000067F00004005000060F80100000000__00000047E31D98D1-0000004C49155071":{"file_size":264921088,"generation":2,"shard":"0008"},"000000067F00004005000060F30005598000-000000067F00004005000060F3000559C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001429534-000000067F00004005000060F80100000000__00000122A7BB7B29-0000012694E36301":{"file_size":231964672,"generation":2,"shard":"0008"},"000000067F00004005000060F70000780000-000000067F00004005000060F80100000000__000000722F474369-00000075CC373F31":{"file_size":263340032,"generation":2,"shard":"0008"},"000000067F00004005000060F300019F31AA-000000067F00004005000060F30100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":168484864,"generation":2,"shard":"0008"},"000000067F000040050081DB430000822079-000000067F000040050081DB43000082C0F1__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007AC000-000000067F000040050081DB4300007F913A__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005847319-000000067F00004005000060F300058C8000__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":261505024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E21687-000000067F00004005000060FB0100000000__000000923719A971-00000096262826C9":{"file_size":224403456,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C98000-000000067F00004005000060F30003CB8FCF__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000045029C-030000000000000000000000000000000002__0000008DBE2855F9-000000923719A971":{"file_size":89505792,"generation":2,"shard":"0008"},"000000067F00004005000060F3000559C000-000000067F00004005000060F300055B8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000285901B-000000067F00004005000060F300028920E4__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E64000-000000067F00004005000060F30000E70000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shar
d":"0008"},"000000067F00004005000060F300015FB022-000000067F00004005000060F3000160410C__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FDA081-000000067F00004005000060F30100000000__00000184624E5741-000001860C80A151":{"file_size":202276864,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000107973-000000067F0000400500EE16BC0100000000__000000F309FCDD19-000000F6661C9241":{"file_size":275456000,"generation":2,"shard":"0008"},"000000067F00004005000060F300031C40D1-000000067F00004005000060F300031D516C__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00001F7D38-000000067F00004005016EA00C000020FBCF__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FDA1F80100000000-000000067F0000400500FF2A510000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001182EC9-000000067F00004005000060F80100000000__000000FF8B261599-000001048B25A8E9":{"file_size":174284800,"generation":2,"shard":"0008"},"000000067F00004005000060F700011528FB-000000067F00004005000060F70001182EC9__000000FF8B261599-000001048B25A8E9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300024DC000-000000067F00004005000060F30002510000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00000B0000-030000000000000000000000000000000002__000000021DC73119-000000044854EBD1":{"file_size":259375104,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001DF0B43-000000067F00004005000060FB0001E21687__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000088000-000000067F00004005000060F10000090000__00000008B6B51879-0000000D55A212C9":{"file_size":264142848,"generation":2,"shard":"0008"},"000000067F00004005000060F30003968000-000000067F00004005000060F3000396C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017AC000-000000067F00004005016EA00C00017E8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000019C73D-000000067F00004005000060F20100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":124698624,"generation":2,"shard":"0008"},"000000067F00004005000060F700001F8000-000000067F00004005000060F700002005D2__0000001B59EEB909-0000001FFBC01501":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001110000-000000067F00004005000060FB0001114000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000019842A-000000067F00004005000060F20100000000__0000001737D88379-0000001B59EEB909":{"file_size":145137664,"generation":2,"shard":"0008"},"000000067F00004005000060F700003BC000-000000067F00004005000060F700003C0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000280000-000000067F00004005000060FB0000284000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000007C000-000000067F0000400500DBCED500000A8000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5732691-000001BCB5734CD9":{"file_size":24576,"generation":239,"shard":"0008"},"000000067F00004005010660F70100000000-000000067F00
0040050107B547000006C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C24000-000000067F00004005000060F30000CA0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000569C000-000000067F00004005000060F300056D8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00000C7A73-030000000000000000000000000000000002__0000018624969469-000001880F984A29":{"file_size":40566784,"generation":11,"shard":"0008"},"000000067F00004005000060F30001344000-000000067F00004005000060F30001358000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F38F48-000000067F00004005000060F50100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001760000-000000067F00004005000060F30001789027__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000018821A-000000067F00004005000060F1000019842A__0000001737D88379-0000001B59EEB909":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300059AA115-000000067F00004005000060F300059B324D__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001400000-000000067F00004005000060FB0001404000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000E7A62-000000067F0000400500EB4A480000107973__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000498000-000000067F00004005000060F3000049C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D24000-000000067F00004005000060F70000D38000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000120E409-000000067F000040050081DB430300000000__0000018613F0A050":{"file_size":24576,"generation":3,"shard":"0008"},"000000067F00004005000060FB0001A8A1CD-000000067F00004005000060FB0100000000__0000007E3A9BFD29-0000008196C976A1":{"file_size":199622656,"generation":2,"shard":"0008"},"000000067F00004005000060F30006270000-000000067F00004005000060F50100000000__0000016E41E03CA0":{"file_size":71114752,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000BAAD15-030000000000000000000000000000000002__0000003579F03331-0000003959DA2DE9":{"file_size":182321152,"generation":2,"shard":"0008"},"000000067F00004005000060F700016205B5-000000067F00004005000060F80100000000__0000012E71CF31F9-000001334140FC21":{"file_size":266862592,"generation":2,"shard":"0008"},"000000067F00004005000060F300030C0FE5-000000067F00004005000060F30003102107__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004BC000-000000067F00004005016EA00C00004E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F10000440000-000000067F00004005000060F1000046821B__00000047E31D98D1-0000004C49155071":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009C8000-000000067F000040050081DB4300009CC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000106C000-000000067F00004005000060F700010AABC7__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000367733F-0
00000067F00004005000060F50100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000478000-000000067F00004005016EA00C000047C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002E4104A-000000067F00004005000060F30002E4A157__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001370000-000000067F00004005000060FB0001374000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B1111A-000000067F00004005000060F30004B221FE__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C000-000000067F00004005016EA00C00002D0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001C3C000-000000067F00004005000060F30001CC0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000136C000-000000067F00004005000060FB0001370000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000488000-000000067F00004005000060F10100000000__0000004C49155071-0000004F31878919":{"file_size":268754944,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B0300C-000000067F00004005000060F60100000000__0000003203FB5749-0000003579F03331":{"file_size":212885504,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C0F79A-000000067F00004005016EA00C0001C3F636__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000399C000-000000067F00004005000060F300039A0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001574000-000000067F00004005000060F700015A195C__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B00697-000000067F00004005000060F30100000000__0000015DD1D3C809-0000016143292911":{"file_size":282025984,"generation":2,"shard":"0008"},"000000067F00004005000060F300050C8000-000000067F00004005000060F300050CC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000885C5-000000067F00004005000060F80100000000__000000044854EBD1-00000008B6B51879":{"file_size":253878272,"generation":2,"shard":"0008"},"000000067F00004005000060F30001407F7A-000000067F00004005000060F50100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B90000-000000067F00004005000060F70000B94000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000560000-000000067F000040050081DB430000564000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F700017405D4__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043CC000-000000067F00004005000060F300043F8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000129D29A-000000067F00004005000060F30100000000__00000057593D8169-0000005C01565329":{"file_size":110788608,"generation":2,"shard":"0008"},"000000067F00004005000060F300003F9F83-000000067F00004005000060F30000402F4A__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060
F70001940000-000000067F00004005000060F700019685CE__0000014784964B91-0000014B000D1821":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B8000-000000067F00004005000060F300043BC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000370FD1-000000067F00004005000060F60100000000__0000000D55A212C9-000000114A805939":{"file_size":232144896,"generation":2,"shard":"0008"},"000000067F00004005000060F30003849093-000000067F00004005000060F300038720A2__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100003C0432-000000067F00004005000060F20100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":262701056,"generation":2,"shard":"0008"},"000000067F00004005000060F700014F85DF-000000067F00004005000060F70001510BBE__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000253B7A3-000000067F00004005000060F50100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001404000-000000067F00004005000060FB0001408000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F942CF-000000067F00004005000060F30003FCD352__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B38000-000000067F00004005000060FB0000B58B45__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B505C8-000000067F00004005000060F80100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":226459648,"generation":2,"shard":"0008"},"000000067F00004005000060F3000612D506-000000067F00004005000060F30006166575__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000DC000-000000067F00004005000060F700000E0000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000000C000-000000067F0000400500FB3D310000018000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C329-000001BCB572C481":{"file_size":24576,"generation":19,"shard":"0008"},"000000067F00004005000060F30002828000-000000067F00004005000060F3000282C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B0000-000000067F00004005000060F300015B4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000078000-000000067F0000400500DBCED5000007C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000086E169-030000000000000000000000000000000002__000000A583FBFB91-000000A9EB8C4489":{"file_size":77471744,"generation":2,"shard":"0008"},"000000067F0000400501046F39000000BDD2-000000067F00004005010660F500000161F7__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3101FFFFFFFF-000000067F0000400500FB3D310300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00000F28ED-030000000000000000000000000000000002__000000F91FE84F08":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E9307A-000000067F00004005000060F30004EA41A5__000001440D3D0C69-0000014784964B91":{"file_
size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016D21CF-030000000000000000000000000000000002__000000698AF6E809-0000006DDB29D589":{"file_size":226353152,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001876BD-000000067F0000400500EB4A48000018F5CD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002E5B84-030000000000000000000000000000000002__000000DBD29DC248":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D8985C-000000067F00004005000060F70000DA1E38__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C28000-000000067F000040050081DB430000C3A075__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000407201D-000000067F00004005000060F300040E319D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002B3CE-000000067F00004005000060F60100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D60000-000000067F00004005000060F80100000000__000000C483D0D6B8":{"file_size":133947392,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F705D6-000000067F00004005000060F80100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":259842048,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E7A062-000000067F00004005000060F30004E9307A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006810000-000000067F00004005000060F30006814000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007D05C8-000000067F00004005000060F80100000000__00000075CC373F31-00000079F2A2F311":{"file_size":251740160,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__0000018624969469-000001880F984A29":{"file_size":40960,"generation":11,"shard":"0008"},"000000067F00004005000060FB00014D8000-000000067F00004005000060FB0001530B44__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EA8000-000000067F00004005000060FB0001EAC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000230A0C7-000000067F00004005000060F30100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":213680128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A98000-000000067F00004005000060F30000AC9024__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F72201-000000067F00004005000060F30003F7B254__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000498000-000000067F00004005016EA00C000049C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CB8000-000000067F00004005000060F30004CBC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042CC1BD-000000067F00004005000060F300042D51D6__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D310000028681-000000067F0000400500FB3D320100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3
000474302B-000000067F00004005000060F300047EC0CA__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003204000-000000067F00004005000060F30003278000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024020ED-000000067F00004005000060F3000240B12A__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000216C000-000000067F00004005000060F30002170000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000005DD43-000000067F00004005000060F60100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000348B45-000000067F00004005000060FB000037968A__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000778000-000000067F000040050081DB43000077C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B4000-000000067F000040050081DB43000120E409__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CCA0B9-000000067F00004005000060F30003D0B155__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D4000-000000067F00004005000060FB0000A7AF6E__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700008F0000-000000067F00004005000060F80100000000__00000089F4693119-0000008DBE2855F9":{"file_size":262905856,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA0000-000000067F00004005000060F30006CA4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E021D0-000000067F00004005000060FB0000E0AD15__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003654000-000000067F00004005000060F3000367733F__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DC0000-000000067F00004005000060F70000DE05C8__000000C824C09619-000000CC13D2E549":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F700018D85CA-000000067F00004005000060F80100000000__000001440D3D0C69-0000014784964B91":{"file_size":260775936,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EAC000-000000067F00004005000060FB0000EB8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E70000-000000067F00004005000060F30000E74000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FE621A-000000067F00004005000060F30005FFF23F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D20000-000000067F00004005000060F70000D24000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005244000-000000067F00004005000060F3000525C065__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D9001FFFFFFFF-000000067F0000400501025D900300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD4000-000000067F00004005000060F30001CE0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004
005016EA00C0000E77906-000000067F00004005016EA00C0000E7F7A7__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300046B41AA-000000067F00004005000060F30100000000__0000012E71CF31F9-000001334140FC21":{"file_size":199688192,"generation":2,"shard":"0008"},"000000067F000040050100D04D00000634BB-030000000000000000000000000000000002__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":173744128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA4000-000000067F00004005000060F30000CB16B6__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DDC000-000000067F00004005000060F30004DF086C__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D7F2DE-000000067F00004005000060F30005DA03A8__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A0000-000000067F00004005000060F300048A4000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100003954D3-000000067F00004005000060F20100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300043BC000-000000067F00004005000060F300043C8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1C000-000000067F00004005016EA00C0001D78000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F100000D8000-000000067F00004005000060F100000E021B__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A0282-000000067F00004005000060F300060A93B5__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021D8F8-000000067F00004005000060F20100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":88227840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000018000-000000067F00004005000060F3000001C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430000E48000-000000067F000040050081DB430000E4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E8FBC-000000067F00004005000060F300003F9F83__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004868000-000000067F00004005000060F3000486C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700013D0000-000000067F00004005000060F700013E85D1__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001203856-030000000000000000000000000000000002__0000005413AB3641-00000057593D8169":{"file_size":157130752,"generation":2,"shard":"0008"},"000000067F00004005000060F3000029C000-000000067F00004005000060F300002C4887__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005160000-000000067F00004005000060F30005164000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000001C000-000000067F0000400500FB3D310000028681__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029F90B-000000067F00004005016EA00C00002D77AE__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":1
1,"shard":"0008"},"000000067F00004005000060F30003620000-000000067F00004005000060F30100000000__000000F309FCDD19-000000F6661C9241":{"file_size":249372672,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B90000-000000067F00004005000060F30003B94000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F4000-000000067F00004005000060F30000208000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001BB8000-000000067F00004005000060F30001C00FE1__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005210000-000000067F00004005000060F30005214000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002070F71-000000067F00004005000060F30002079FDE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B40000-000000067F00004005000060F30000BB103B__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000290000-000000067F00004005000060F10000298000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":264134656,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00007C7B9C-000000067F00004005016EA00C0000807A34__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001548000-000000067F00004005000060FB000154C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100005FC000-000000067F00004005000060F1000062EE46__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A0000-000000067F0000400500EE16BC00001A4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F94000-000000067F00004005016EA00C0000F98000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000290000-000000067F00004005000060F80100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":265764864,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BC0B44-000000067F00004005000060FB0001BD1689__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000337DCF2-000000067F00004005000060F30003386D10__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300045C1062-000000067F00004005000060F3000460202F__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006814000-000000067F00004005000060F30006850000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000073DFA8-000000067F00004005016EA00C000079FCFA__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000178C000-000000067F00004005016EA00C00017A8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000051D1AE-000000067F00004005000060F20100000000__00000057593D8169-0000005C01565329":{"file_size":103145472,"generation":2,"shard":"0008"},"000000067F00004005000060F300034BD86C-000000067F00004005000060F30100000000__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":95617024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000008000-000000067F000040050
16EA00C000000FEA0__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F1000014C000-000000067F00004005000060F1000015F545__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000000EAB-000000067F0000400500FB3D300100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":12976128,"generation":2,"shard":"0008"},"000000067F000040050081DB430000028000-000000067F000040050081DB43000002C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BD1689-000000067F00004005000060FB0100000000__0000008625CF2891-00000089F4693119":{"file_size":223690752,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000000001__000000FF8B261599-000001048B25A8E9":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D952B0-000000067F00004005000060F30003DAE2DC__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B30000-000000067F00004005000060F70000B505C8__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000549D0A6-000000067F00004005000060F300055861F2__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000046821B-000000067F00004005000060F20100000000__00000047E31D98D1-0000004C49155071":{"file_size":266969088,"generation":2,"shard":"0008"},"000000067F00004005000060F300043C8000-000000067F00004005000060F300043CC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E720A2-000000067F00004005000060F30100000000__000000923719A971-00000096262826C9":{"file_size":141344768,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003A8000-000000067F000040050081DB4300003AC000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006AB7A6-000000067F00004005000060F700006C3D76__000000663565F8C9-000000698AF6E809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000570A19E-000000067F00004005000060F3000573B206__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AF28CB-000000067F00004005000060F30003B33945__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015CC000-000000067F00004005000060FB00015D8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A9CFB-000000067F0000400500D69D7900000D1C5F__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A30000-000000067F00004005000060F30002A34000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000047C000-000000067F00004005000060F30000498000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FFF23F-000000067F00004005000060F300060A0282__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C194-000000067F00004005016EA00C00004EF809__0000018EC67807C9-000001935283F9B9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006D64000-000000067F00004005000060F30006DC8000__000001848D082B20":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001340000-000000067F00004005016EA00C0001344000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000BB0000-000000067F00004005016EA00C0000BB4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000007F0F__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000114000-000000067F0000400500E3A2A1000016321A__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000578000-030000000000000000000000000000000002__0000009A24DF6768":{"file_size":107642880,"generation":2,"shard":"0008"},"000000067F00004005000060F30006798000-000000067F00004005000060F3000679C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000E021B-000000067F00004005000060F1000010043F__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000DA8000-030000000000000000000000000000000002__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":233201664,"generation":2,"shard":"0008"},"000000067F00004005000060F100004EC079-000000067F00004005000060F20100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000170C000-000000067F00004005000060F70001720000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FCD85E-000000067F00004005000060F80100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015B74FF-000000067F00004005016EA00C00015FF3A0__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000AC9024-000000067F00004005000060F30000ADA0D0__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16C40100000000-000000067F0000400500F3A25C000006C000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000F1B5B-000000067F0000400500D69D790100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":233275392,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C0C000-000000067F00004005000060F30003C257AD__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E44000-000000067F00004005000060F30000E60000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000018E4B6-000000067F00004005000060F7000019EA78__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017E8000-000000067F00004005016EA00C00017EC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003A4C09C-000000067F00004005000060F30003A6D1B3__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100000260F2-000000067F00004005000060F20100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000097BDA-000000067F00004005016EA00C00000C7A73__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400002CDD5C-030000000000000000000000000000000002__000000D31E48D7C9-000000D74E29AAD1
":{"file_size":90923008,"generation":2,"shard":"0008"},"000000067F00004005000060F3000685C000-000000067F00004005000060F30006860000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C84000-000000067F00004005000060FB0001CE16ED__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CC4BC2-000000067F000040050081DB430000CD6C36__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006349DA2-000000067F00004005000060F30006382F14__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000212E160-000000067F00004005000060F30100000000__0000009DF02C1241-000000A173C00489":{"file_size":224731136,"generation":2,"shard":"0008"},"000000067F00004005000060F30001FF8691-000000067F00004005000060F30100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":256114688,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F4000-000000067F00004005000060F30006810000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700015A8000-000000067F00004005000060F700016205B5__0000012E71CF31F9-000001334140FC21":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000024000-000000067F0000400500D69D790000028000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007AE010-000000067F00004005000060F80100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000428000-000000067F00004005016EA00C000042C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001E74000-000000067F00004005000060F30001F28000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038FF04F-000000067F00004005000060F30100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":45359104,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B0FD2A-000000067F00004005016EA00C0001B4FBC9__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006858000-000000067F00004005000060F3000685C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F9A0EB-000000067F00004005000060F30002FD317C__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000808000-000000067F000040050081DB430000822079__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015DC000-000000067F00004005000060FB00015F0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000021C000-000000067F00004005000060F7000025DA3C__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000007C000-000000067F0000400500D69D7900000A8000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001EE3D-000000067F00004005000060F60100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000F4E15B-030000000000000000000000000000000002__000000C462B3C2A9-000000C824C09619":{"file_size":73662464,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F28000-000000067F00004005000060F30001F2C000__000000
9A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001F1DA6-030000000000000000000000000000000002__00000081AA3C40F0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70001758B92-000000067F00004005000060F70001771169__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000010000-000000067F0000400500E3A2A10000017F02__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A98000-000000067F00004005000060F30002A9C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000573B206-000000067F00004005000060F300057942F4__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000860B45-030000000000000000000000000000000002__00000023FEF9F321-00000028C365FBE1":{"file_size":252788736,"generation":2,"shard":"0008"},"000000067F00004005000060F7000090B929-000000067F00004005000060F80100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014B73D-000000067F00004005000060F80100000000__000000114A805939-00000013FB921C81":{"file_size":146432000,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D3C000-000000067F00004005000060F70000D60000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001514000-000000067F00004005000060F70001528000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001764000-000000067F00004005016EA00C0001788000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001358000-000000067F00004005000060F3000135C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001594000-000000067F00004005000060FB00015C8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004AC000-000000067F00004005000060F300004B8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005610000-000000067F00004005000060F30005614000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002794000-000000067F00004005000060F300027C0000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C60000-000000067F00004005000060F30004C64000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003A0000-000000067F00004005000060F700003B85C7__0000003579F03331-0000003959DA2DE9":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F1034-030000000000000000000000000000000002__000000E4C63CFA21-000000E7C2F1B249":{"file_size":247480320,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B4000-000000067F00004005000060F300051F0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000003C77D-000000067F00004005000060F60100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010660F500000161F7-030000000000000000000000000000000002__0000010FB1BE19B9-00000113456156F1":{"file_size":64757760,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F7B254-000000067F00004005000060F30003F942CF__0000011F1A40FA69-00000122A7BB7B2
9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004900000-000000067F00004005000060F30004904000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006F1C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A21037-000000067F00004005000060F30003A31FB6__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000DB0000-000000067F00004005000060F30000E40F86__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A60B43-000000067F00004005000060FB0001A71688__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DC8000-000000067F00004005000060F30006DCC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006E38F6-000000067F00004005000060F80100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122B1C9-000000067F00004005000060F300012442A9__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EA8000-000000067F00004005000060FB0000EAC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B5A072-000000067F00004005000060F80100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144DCA3-000000067F00004005016EA00C000151F7C5__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F600000711FF-000000067F00004005000060F60100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050EC000-000000067F00004005000060F30005138000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005260000-000000067F00004005000060F30005290FC9__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700012DE407-000000067F00004005000060F80100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F10000-000000067F00004005000060F70000F185D4__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D38000-000000067F00004005000060F70000D3C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006671F-000000067F00004005000060F60100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300059F53C6-000000067F00004005000060F30005A16504__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B08000-000000067F000040050081DB430000B4A075__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000152C000-000000067F00004005000060F70001570000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000128000-000000067F00004005000060F3000012C000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F70000E24000-000000067F00004005000060F70000E387D6__000000D037B2DBD0":{"file_size":1
34422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002791D8-000000067F000040050081DB43000028B253__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000500F7-000000067F00004005000060F60100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000ABD9C4-000000067F00004005000060F80100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009CC000-000000067F000040050081DB430000A10000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700002005D2-000000067F00004005000060F80100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":261169152,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AA656E-000000067F000040050081D80C0100000000__00000081AA3C40F0":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E14000-000000067F000040050081DB430000E48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DD734C-000000067F00004005000060F30003E40000__0000011B632CC319-0000011F1A40FA69":{"file_size":261046272,"generation":2,"shard":"0008"},"000000067F0000400500D19D0300FFFFFFFF-030000000000000000000000000000000002__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":5373952,"generation":2,"shard":"0008"},"000000067F00004005000060F30001588000-000000067F00004005000060F3000158C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000AC000-000000067F0000400500DBCED500000D0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000013F89B-000000067F0000400500EB4A48000014F7AC__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005D704F-000000067F000040050081DB4300006310C9__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A14000-000000067F000040050081DB430000A18000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F574A6-000000067F00004005000060F30001FF8691__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D320100000000-000000067F0000400500FDA1F80000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B09104-000000067F00004005000060F30001B4A119__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005011035750100000000-030000000000000000000000000000000002__00000159B010F6C0":{"file_size":78626816,"generation":2,"shard":"0008"},"000000067F00004005000060F1000015F545-000000067F00004005000060F20100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000638C06D-000000067F00004005000060F300063A50CD__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000299C28F-000000067F00004005000060F300029A526C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000364C000-000000067F00004005000060F30003650000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE0000-000000067F00004005016EA00C0000CE4000__0000019E7001E460":{"file_size
":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000794000-000000067F000040050081DB4300007A8000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A18000-000000067F000040050081DB430000A1C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000000C000-000000067F00004005000060F30000018000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB4300000D40FF-030000000000000000000000000000000002__00000075CC373F31-00000079F2A2F311":{"file_size":78061568,"generation":2,"shard":"0008"},"000000067F00004005000060F60000099FD8-000000067F00004005000060F60100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000330A1C8-000000067F00004005000060F3000332B1B6__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FA900D-000000067F00004005000060F30006FDA081__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000148AC30-000000067F00004005000060FB000149B774__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000EF1FC3-000000067F00004005000060F50100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006A7C566-000000067F00004005000060F30100000000__00000178B8B10551-0000017C9F5597E1":{"file_size":173072384,"generation":2,"shard":"0008"},"000000067F00004005000060FB000104B856-000000067F00004005000060FB000107C39B__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000030000-000000067F00004005000060F80100000000__000000021DC73119-000000044854EBD1":{"file_size":261341184,"generation":2,"shard":"0008"},"000000067F00004005000060F30003580FD3-000000067F00004005000060F30100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":228188160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001224000-000000067F00004005000060F70001232ACF__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300022B9050-000000067F00004005000060F3000230A0C7__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006654000-000000067F00004005000060F30006670000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D0000-000000067F00004005000060F700010D85CF__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000FD8000-030000000000000000000000000000000002__000000C824C09619-000000CC13D2E549":{"file_size":237559808,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F0000-000000067F00004005000060FB00015F4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60100000000-000000067F00004005000060F70000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000DA1E38-000000067F00004005000060F80100000000__000000C462B3C2A9-000000C824C09619":{"file_size":209821696,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D76250-000000067
F00004005000060F30005D7F2DE__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000418000-000000067F00004005000060F10100000000__00000044B4679349-00000047E31D98D1":{"file_size":269148160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B61000-000000067F00004005000060F80100000000__0000018613F0A050":{"file_size":65150976,"generation":3,"shard":"0008"},"000000067F00004005000060F300008C8000-000000067F00004005000060F300008E0F49__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002D8000-030000000000000000000000000000000002__0000008625CF2891-00000089F4693119":{"file_size":231907328,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C04000-000000067F00004005000060FB0000C08000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001808000-000000067F00004005000060FB000180C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A30379-030000000000000000000000000000000002__000000AFE87558B0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D85CF-000000067F00004005000060F80100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":164970496,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C70000-000000067F00004005000060FB0000C74000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001188000-000000067F00004005016EA00C000118C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000CB85B3-000000067F00004005000060F70000CC8B74__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A1D870-000000067F00004005000060F30004A2693B__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008CF772-000000067F00004005016EA00C00008E760F__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D34000-000000067F00004005016EA00C0000D5D1E9__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014B79E7-000000067F00004005016EA00C00014CF88D__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300040E319D-000000067F00004005000060F300040F41F4__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FF427D-000000067F00004005000060F30100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":156073984,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E0A466-000000067F00004005000060F30005E3B48F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700005F9158-000000067F00004005000060F80100000000__00000057593D8169-0000005C01565329":{"file_size":230768640,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E4000-000000067F00004005016EA00C000193189A__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30005F0202C-000000067F00004005000060F30005F3303F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000148000-000000067F00004005000060F1000014C000__000000174479FC18":{"file_size":
134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C0000-000000067F00004005000060F300060C4000__0000016E41E03CA0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C9C000-000000067F00004005000060FB0000CC6E51__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54700000A0EB1-000000067F000040050109CD330100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004EC000-000000067F00004005016EA00C00005A0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000A9F465-000000067F00004005016EA00C0000ACF305__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000208000-000000067F00004005000060F3000020C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000011E137-000000067F0000400500F67839000003E09B__000001048B25A8E9-0000010779A7F551":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30000402F4A-000000067F00004005000060F60100000000__000000114A805939-00000013FB921C81":{"file_size":166469632,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004A8000-000000067F00004005016EA00C00004AC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001968000-000000067F00004005000060F7000196C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EF8000-000000067F00004005000060F30006EFC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000BB4000-000000067F00004005016EA00C0000C20000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009C0000-000000067F00004005000060F80100000000__0000009A24DF6768":{"file_size":37371904,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C84000-000000067F00004005000060F30004CB8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002514000-000000067F00004005000060F30002530000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DE05C8-000000067F00004005000060F80100000000__000000C824C09619-000000CC13D2E549":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F301FFFFFFFF-000000067F00004005000060F30300000000__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30001886B2A-000000067F00004005000060F50100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700006A8000-000000067F00004005000060F80100000000__000000636DE92159-000000663565F8C9":{"file_size":117022720,"generation":2,"shard":"0008"},"000000067F00004005000060FB000154C000-000000067F00004005000060FB0001558000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053F40CC-000000067F00004005000060F30100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":223453184,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C95225-000000067F00004005000060F30005C9E3C4__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000558C000-000000067F00004005000060F30005598000__00000159B010F6C0":{"file_size":134422528,"generation":2
,"shard":"0008"},"000000067F00004005000060F30003FFA699-000000067F00004005000060F50100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F1C000-000000067F00004005000060F50100000000__000001848D082B20":{"file_size":24117248,"generation":2,"shard":"0008"},"000000067F00004005000060F3000486C000-000000067F00004005000060F30004878000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300019C2056-000000067F00004005000060F300019F31AA__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000004C000-000000067F0000400500EE16BC0000060000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000046EAB9-000000067F00004005000060F80100000000__000000417D21ACF9-00000044B4679349":{"file_size":48717824,"generation":2,"shard":"0008"},"000000067F000040050081DB430000790000-000000067F000040050081DB430000794000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000002C000-000000067F0000400500D69D790000078000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000026C90-000000067F00004005000060F60100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000738000-000000067F00004005000060F3000073C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000204000-000000067F00004005000060F10000218000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000177E20-000000067F0000400500C782E400001AFD31__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000048C000-000000067F00004005000060F700004B1E77__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015F8000-000000067F00004005000060F50100000000__000000698F2C3A38":{"file_size":131276800,"generation":2,"shard":"0008"},"000000067F00004005000060F30000428000-000000067F00004005000060F3000042C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000038C000-000000067F000040050081DB430000390000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000102A1CE-000000067F00004005000060FB000103AD12__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001848000-000000067F00004005000060FB000184C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001DC000-000000067F00004005000060FB0000228000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D4000-000000067F00004005016EA00C0001228000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000011775B-030000000000000000000000000000000002__0000018820A34650":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F700011B8000-000000067F00004005000060F80100000000__000001048B25A8E9-0000010779A7F551":{"file_size":263897088,"generation":2,"shard":"0008"},"000000067F00004005000060F3000660D31F-000000067F00004005000060F3000664E3CA__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000064000-00000
0067F0000400500EE16BC00000F28ED__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000525C065-000000067F00004005000060F50100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A7F98F-000000067F00004005000060F30100000000__000001398B56A519-0000013C9C0E3339":{"file_size":47595520,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004369C-000000067F000040050100D04D000004B5AD__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001A6E2-000000067F00004005000060F60100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700004405CF-000000067F00004005000060F80100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":198836224,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D28000-000000067F00004005000060F30002D2C000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F56D510100000000-000000067F0000400500F67839000003C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E387D6-000000067F00004005000060F80100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000213C000-000000067F00004005000060F30002168000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060D4415-000000067F00004005000060F3000612D506__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3100000546CB-000000067F0000400500FB3D320100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000D18CA9-030000000000000000000000000000000002__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":210288640,"generation":2,"shard":"0008"},"000000067F00004005000060F60000062E4F-000000067F00004005000060F60100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000016A065-000000067F0000400500F3A25C000017C0CB__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AD0000-000000067F00004005000060FB0001B28B44__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000254000-000000067F00004005000060F30000298000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E8C000-000000067F000040050081DB430000EA0000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300040F41F4-000000067F00004005000060F3000412D27C__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00013B8000-000000067F00004005000060FB00013BC000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000D8000-000000067F00004005000060F700000DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000958000-000000067F00004005000060F700009605D8__000000923719A971-00000096262826C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A0000-000000067F00004005000060FB00004A4000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700001213F2-000000067F00004005000060
F80100000000__0000000D55A212C9-000000114A805939":{"file_size":55320576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004156457-000000067F00004005000060F30100000000__00000122A7BB7B29-0000012694E36301":{"file_size":96927744,"generation":2,"shard":"0008"},"000000067F00004005000060F30003278000-000000067F00004005000060F3000327C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000158F667-000000067F00004005016EA00C00015B74FF__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001D50000-000000067F00004005000060FB0001D88B43__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000054AE8-000000067F00004005000060F60100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300002C4887-000000067F00004005000060F60100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B34000-000000067F00004005000060F70001B5A072__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000416A8-000000067F00004005000060F60100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F10000050000-000000067F00004005000060F10000058000__000000044854EBD1-00000008B6B51879":{"file_size":264011776,"generation":2,"shard":"0008"},"000000067F00004005000060F300043FC000-000000067F00004005000060F300044D3639__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004878000-000000067F00004005000060F3000487C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000396C000-000000067F00004005000060F30003998000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019F7907-000000067F00004005016EA00C0001A477A4__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268443648,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014D7727-000000067F00004005016EA00C00014E75C6__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00016570D9-030000000000000000000000000000000002__000001AC25760149-000001AFC313C819":{"file_size":86335488,"generation":11,"shard":"0008"},"000000067F00004005000060F70001270000-000000067F00004005000060F80100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":265363456,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003BFD31-000000067F0000400500EB4A4800003C7C42__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B31F8-000000067F00004005000060F300014CC16D__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D5D1E9-030000000000000000000000000000000002__0000019E7001E460":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F100003B8214-000000067F00004005000060F100003C0432__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001346854-000000067F00004005016EA00C000135FCAD__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000160410C-000000067F00004005000060F3000165515A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"sh
ard":"0008"},"000000067F00004005000060FB000118B12B-030000000000000000000000000000000002__00000054161C34B8":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF0000-000000067F00004005000060F30006DF4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C4000-000000067F00004005000060F700003FE341__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000FF0000-000000067F00004005000060F30100000000__0000004C49155071-0000004F31878919":{"file_size":256286720,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F4000-000000067F00004005000060FB00015FCD31__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005816253-000000067F00004005000060F30005847319__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002460000-000000067F00004005000060F30002464000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000113A337-000000067F00004005000060F700011528FB__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000037968A-030000000000000000000000000000000002__0000000D55A212C9-000000114A805939":{"file_size":226426880,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000128000-000000067F00004005016EA00C000012FE9A__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000036FF11-000000067F0000400500EB4A4800003A7E20__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000658113F-000000067F00004005000060F3000659A203__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D18000-000000067F00004005016EA00C0001D1C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001A44000-000000067F00004005000060F30001AB1583__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000138000-000000067F00004005000060F1000013C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300009BC000-000000067F00004005000060F30000A50000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000110E30C-000000067F00004005000060F80100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F50100000000-000000067F00004005000060F60000014000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006FA900D__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001D88B43-000000067F00004005000060FB0100000000__0000008DBE2855F9-000000923719A971":{"file_size":249028608,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122A1D5-000000067F00004005000060F30100000000__0000005413AB3641-00000057593D8169":{"file_size":48783360,"generation":2,"shard":"0008"},"000000067F00004005000060F30006277C61-000000067F00004005000060F30006320C60__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000388000-000000067F000040050081DB43000038C000__0000008DDCD70B68":{"file_size
":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E67A6E-000000067F00004005016EA00C0000E77906__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300009B8000-000000067F00004005000060F300009BC000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D900000068000-000000067F00004005010450640000000570__0000010FB1BE19B9-00000113456156F1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00002D4B6A-030000000000000000000000000000000002__0000000D80565628":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E50FF3-000000067F00004005000060F30001E720A2__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A4000-000000067F00004005016EA00C0000670000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000C18000-000000067F00004005000060FB0000C1C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BA4F5B-000000067F00004005000060F70000BBD532__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AC115C-000000067F00004005000060F80100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":237248512,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D24000-000000067F00004005000060F30004DA8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA4000-000000067F00004005000060F30006D10000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001433D0-030000000000000000000000000000000002__000000FCCD5238B1-000000FF8B261599":{"file_size":146407424,"generation":2,"shard":"0008"},"000000067F00004005000060F3000165515A-000000067F00004005000060F30100000000__000000698AF6E809-0000006DDB29D589":{"file_size":112680960,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000118C000-000000067F00004005016EA00C00011D0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB43000094A076-030000000000000000000000000000000002__000000A9EB8C4489-000000ACA44C8E99":{"file_size":176054272,"generation":2,"shard":"0008"},"000000067F00004005000060F70001528000-000000067F00004005000060F7000152C000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C82B50-000000067F000040050081DB430000CC4BC2__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EF15A-000000067F000040050081DB4300002791D8__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000125BF2-000000067F00004005000060F20100000000__000000114A805939-00000013FB921C81":{"file_size":78782464,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E40F86-000000067F00004005000060F30100000000__000000417D21ACF9-00000044B4679349":{"file_size":111108096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF0000-000000067F00004005016EA00C0000FF4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000CB16B6-000000067F00004005000060F50100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001990000-000000067F00004005
000060F70001994000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A54000-000000067F00004005000060F30000A5F9BB__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300061B8705-000000067F00004005000060F300061D9774__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000084C000-000000067F00004005000060F70000858000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000848000-000000067F00004005000060F7000084C000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D18000-000000067F00004005000060F30001D79136__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001558000-000000067F00004005000060FB000155C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024440AE-000000067F00004005000060F3000244D189__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002CFC020-000000067F00004005000060F30100000000__000000C824C09619-000000CC13D2E549":{"file_size":150708224,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A4A074-000000067F000040050081DB430000A640EA__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C98000-000000067F00004005000060FB0000C9C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001840000-000000067F00004005000060FB0001844000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000802123-000000067F00004005000060F30000853115__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000029ED0-000000067F00004005000060F80100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C00003E4000-000000067F00004005016EA00C00003E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CBC000-000000067F00004005000060F30004D20000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000122C000-000000067F00004005016EA00C0001240000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004DF086C-000000067F00004005000060F50100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050B5199-000000067F00004005000060F30100000000__0000014784964B91-0000014B000D1821":{"file_size":126124032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001A477A4-000000067F00004005016EA00C0001ADF63C__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70001828000-000000067F00004005000060F7000182C000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004F0000-000000067F00004005000060F10000518222__0000005413AB3641-00000057593D8169":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EFD576-000000067F00004005000060F30100000000__00000164DEE06671-0000016834A3FC91":{"file_size":193077248,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50100000000-0000
00067F0000400500FA2AD30000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000258E3A9-000000067F00004005000060F3000259F4A3__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C90000-000000067F00004005000060F70000CB85B3__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000114C000-000000067F00004005000060FB000118B12B__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003644000-000000067F00004005000060F30003648000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A50000-000000067F00004005000060FB0001A60B43__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C257AD-000000067F00004005000060F50100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002DE8000-000000067F00004005000060F30002E4104A__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000C8000-000000067F0000400500F3A25C00000EA069__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002174000-000000067F00004005000060F30002210000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014D5280-000000067F00004005000060F300014E6333__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000332B1B6-000000067F00004005000060F30003344134__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065F42B4-000000067F00004005000060F3000660D31F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E264A-000000067F000040050081DB4300010F46BD__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300069D13FA-000000067F00004005000060F300069FA3F6__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300061D9774-000000067F00004005000060F30006222843__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C821A-000000067F00004005000060F20100000000__000000601F43CF09-000000636DE92159":{"file_size":265183232,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000200000-000000067F0000400500EB4A480000204000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001690000-000000067F00004005000060F70100000000__000001334140FC21-00000137115BE4D9":{"file_size":273965056,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000A575C7-000000067F00004005016EA00C0000A9F465__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001E6C000-000000067F00004005000060FB0001E98000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014195A7-000000067F00004005000060FB000147A0EC__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AE168A-030000000000000000000000000000000002__0000003203FB5
749-0000003579F03331":{"file_size":223379456,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA0000-000000067F00004005000060F30000CA4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E4000-000000067F00004005000060F30000738000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E0000-000000067F00004005000060F300006E4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001124000-000000067F00004005000060FB0001148000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A8000-000000067F0000400500D69D7900000AC000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000130000-000000067F0000400500C782E40000137F10__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000020FBCF-000000067F00004005016EA00C0000257A6F__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001B28B44-000000067F00004005000060FB0100000000__0000008196C976A1-0000008625CF2891":{"file_size":249454592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001120000-000000067F00004005000060FB0001124000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005474062-000000067F00004005000060F3000549D0A6__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000023FA62-030000000000000000000000000000000002__000000D01F399709-000000D31E48D7C9":{"file_size":245366784,"generation":2,"shard":"0008"},"000000067F000040050081DB430000160484-030000000000000000000000000000000002__00000079F2A2F311-0000007E3A9BFD29":{"file_size":226582528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A4FB4-000000067F00004005000060F300038B5F5B__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017E8000-000000067F00004005000060F300017EC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D31000000C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010105DB-000000067F00004005000060F80100000000__000000E4C63CFA21-000000E7C2F1B249":{"file_size":254935040,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858570-000000067F00004005000060F80100000000__0000008196C976A1-0000008625CF2891":{"file_size":252985344,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D4000-000000067F000040050081DB4300001E8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00005E0000-000000067F00004005000060FB0000638B45__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050107B547000006C000-000000067F000040050107B54700000A0EB1__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000430000-000000067F00004005000060FB0000434000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014E6333-000000067F00004005000060F3000151F271__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3001000000
00-000000067F0000400500FB3D300300000000__00000117EDA82C11-0000011B632CC319":{"file_size":65536,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BE7584-000000067F00004005000060F30100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":58204160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001068000-000000067F00004005000060F80100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":168730624,"generation":2,"shard":"0008"},"000000067F00004005000060F1000013C000-000000067F00004005000060F10000148000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000659A203-000000067F00004005000060F300065BB235__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000EC0000-000000067F00004005000060F70000EF85D6__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005010660F500000B4000-000000067F00004005010660F500000F44CB__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A4000-000000067F00004005000060F300067F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F0000-000000067F0000400500DBCED500000F4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000768000-000000067F000040050081DB43000076C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E0000-000000067F00004005016EA00C00018E4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000A50000-000000067F00004005000060F30000A54000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E68000-000000067F00004005000060FB0001E6C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001960000-000000067F00004005000060F300019790A2__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6A1D0-000000067F00004005000060FB0000BAAD15__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E4A157-000000067F00004005000060F30002E630CF__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E70000-000000067F00004005000060F30006E74000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004464DD-000000067F00004005000060F7000046EAB9__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000204000-000000067F0000400500EB4A480000218000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042D51D6-000000067F00004005000060F3000430E1E9__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F30000-000000067F00004005000060FB0100000000__00000047E31D98D1-0000004C49155071":{"file_size":272302080,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006F8000-030000000000000000000000000000000002__0000009DF02C1241-000000A173C00489":{"file_size":235110400,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EC000-000000067F000040050081DB4300001F1DA6__00000081AA3C40F0":{"file_size":134422528,"generation":2,"s
hard":"0008"},"000000067F00004005000060F300038A3082-000000067F00004005000060F30100000000__000001048B25A8E9-0000010779A7F551":{"file_size":76644352,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000400000-000000067F00004005016EA00C0000404000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003481DDB-000000067F00004005000060F30100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":107814912,"generation":2,"shard":"0008"},"000000067F00004005000060F3000489C000-000000067F00004005000060F300048A0000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CD6C36-000000067F000040050081DB430000D18CA9__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004888000-000000067F00004005000060F3000488C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300008E0F49-000000067F00004005000060F30000921E8A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000074000-000000067F0000400500C782E400000A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011F2D11-000000067F00004005000060FB0001203856__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300046330B1-000000067F00004005000060F300046B41AA__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003548000-000000067F00004005000060F30003580FD3__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001198B44-000000067F00004005000060FB00011C1688__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000049C000-000000067F00004005000060F300004A8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B44000-000000067F00004005016EA00C0000BB0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700014F0000-000000067F00004005000060F700014F85DF__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C5E15B-000000067F000040050081DB430000C801D1__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A10000-000000067F00004005000060F30003A21037__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EFC000-000000067F00004005000060F30006F18000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1F87B-000000067F00004005016EA00C0001D7F71A__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060F30002A34000-000000067F00004005000060F30002A40000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F0AA88-000000067F00004005000060F80100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006700000-000000067F00004005000060F30006704000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CC4000-000000067F00004005000060F30001CD0000__0000008
DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858000-000000067F00004005000060F80100000000__00000081AA3C40F0":{"file_size":48439296,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000D6407-000000067F000040050081DB430000160484__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300057DD292-000000067F00004005000060F30005816253__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006222843-000000067F00004005000060F3000625B8F0__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000410000-000000067F00004005000060FB0000430B46__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100006A8000-000000067F00004005000060F100006B0000__0000006DDB29D589-000000722F474369":{"file_size":264110080,"generation":2,"shard":"0008"},"000000067F00004005000060F3000460202F-000000067F00004005000060F300046330B1__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E74000-000000067F00004005000060F30006EF8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A3B020-000000067F00004005000060F30003A4C09C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002535462-000000067F00004005000060F3000258E3A9__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000294000-000000067F0000400500EB4A480000355928__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016E85370000000000-030000000000000000000000000000000002__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":152190976,"generation":2,"shard":"0008"},"000000067F00004005000060F3000158C000-000000067F00004005000060F300015B0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003386D10-000000067F00004005000060F300033D7D7C__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E7C000-000000067F00004005000060F30000EF1FC3__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000030000-000000067F0000400500FA2AD30000034000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005588000-000000067F00004005000060F3000558C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A0000-000000067F00004005000060F300039A4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008A13D-000000067F00004005000060F60100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017120CE-000000067F00004005000060FB000172AC12__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003200000-000000067F00004005000060F30003204000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300007C1007-000000067F00004005000060F30000802123__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000006C0
00-000000067F0000400500F3A25C00000BB439__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B4000-000000067F00004005000060F300015F8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C220F-000000067F00004005000060F300060CB2C8__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A5000004A25C-000000067F0000400500F8E3A50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C9AFB8-000000067F00004005000060F30002CFC020__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F2BD40100000000-000000067F00004005010F44EB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002AEED02-000000067F00004005000060F50100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002EB8000-000000067F00004005000060F30002F5105E__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A1000016321A-030000000000000000000000000000000002__000000EFDE07FFD8":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000135C000-000000067F00004005000060F30001407F7A__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F67839000006AEF4-000000067F0000400500F7D2DD0100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DA03A8-000000067F00004005000060F30005DC93F1__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E2072-000000067F000040050081DB430100000000__000000D01F399709-000000D31E48D7C9":{"file_size":15392768,"generation":2,"shard":"0008"},"000000067F00004005000060F300004A8000-000000067F00004005000060F300004AC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016E0A44-000000067F00004005000060FB0001701588__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300024D8000-000000067F00004005000060F300024DC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BC8000-000000067F00004005000060F30003BCC000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F00100000000-000000067F00004005000060F10000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430100000000-000000067F0000400500C782E40000074000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D14206-000000067F00004005000060F30003D252C8__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700006479E7-000000067F00004005000060F80100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B9C988-000000067F00004005000060F70000BA4F5B__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000078000-000000067F0000400500D69D79000007C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CC8B74-000000067F00004005000060F8010000
0000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":95657984,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000708000-000000067F00004005000060FB000070C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-000000067F000040050081DB430000EEA075__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000001FD3E-000000067F00004005016EA00C0000097BDA__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000689E295-000000067F00004005000060F3000690F2FD__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CE0000-000000067F00004005000060F30000D31030__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-030000000000000000000000000000000002__000000C483D0D6B8":{"file_size":20307968,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000807A34-000000067F00004005016EA00C00008578D4__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430001060000-000000067F000040050081DB430001064000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480F32C-000000067F00004005000060F3000486837F__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700009385D4-000000067F00004005000060F80100000000__0000008DBE2855F9-000000923719A971":{"file_size":252207104,"generation":2,"shard":"0008"},"000000067F00004005000060F30000090000-000000067F00004005000060F300000C1095__000000021DC73119-000000044854EBD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480620C-000000067F00004005000060F3000480F32C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FA40AD-000000067F00004005000060F30005FC519A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014A42B8-030000000000000000000000000000000002__000000601F43CF09-000000636DE92159":{"file_size":137322496,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD0000-000000067F00004005000060F30001CD4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000404000-000000067F00004005016EA00C0000428000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002079FDE-000000067F00004005000060F300020830BE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000487C000-000000067F00004005000060F30004880000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010A188401FFFFFFFF-000000067F00004005010A18840300000000__00000137115BE4D9-000001398B56A519":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000218000-000000067F00004005000060F7000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EF454F-000000067F00004005000060F30005EFD576__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DC93F1-000000067F00004005000060F30005E0A466__00000164DEE06671-0000016834A3FC91":{"file
_size":268451840,"generation":2,"shard":"0008"}},"disk_consistent_lsn":"1BC/B5734CD8","metadata_bytes":{"disk_consistent_lsn":"1BC/B5734CD8","prev_record_lsn":"1BC/B5734CB0","ancestor_timeline":null,"ancestor_lsn":"0/0","latest_gc_cutoff_lsn":"1BC/B5732690","initdb_lsn":"0/14EE150","pg_version":16},"lineage":{}} diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 509f41366b..cda70be8da 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -150,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", From 7a2625b8033bd869f0ab4f8c020dfab3eade56b2 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 1 Aug 2024 09:52:34 -0400 Subject: [PATCH 319/412] storage-scrubber: log version on start (#8571) Helps us better identify which version of storage scrubber is running. --------- Signed-off-by: Alex Chi Z --- Cargo.lock | 1 + storage_scrubber/Cargo.toml | 1 + storage_scrubber/src/main.rs | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index dc4f0c7b81..2677699702 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5758,6 +5758,7 @@ dependencies = [ "either", "futures", "futures-util", + "git-version", "hex", "humantime", "itertools 0.10.5", diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 7d5b7d10b9..d19119990b 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -10,6 +10,7 @@ aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true +git-version.workspace = true hex.workspace = true humantime.workspace = true thiserror.workspace = true diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 4c804c00c1..a111c31844 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -17,6 +17,11 @@ use storage_scrubber::{ use clap::{Parser, Subcommand}; use utils::id::TenantId; +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -101,6 +106,8 @@ enum Command { async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); + tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let bucket_config = BucketConfig::from_env()?; let command_log_name = match &cli.command { From a9bcabe503608169d34f85eb242a24ba423f1066 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 1 Aug 2024 10:00:06 -0400 Subject: [PATCH 320/412] fix(pageserver): skip existing layers for btm-gc-compaction (#8498) part of https://github.com/neondatabase/neon/issues/8002 Due to the limitation of the current layer map implementation, we cannot directly replace a layer. It's interpreted as an insert and a deletion, and there will be file exist error when renaming the newly-created layer to replace the old layer. We work around that by changing the end key of the image layer. A long-term fix would involve a refactor around the layer file naming. For delta layers, we simply skip layers with the same key range produced, though it is possible to add an extra key as an alternative solution. 
* The image layer range for the layers generated from gc-compaction will be Key::MIN..(Key..MAX-1), to avoid being recognized as an L0 delta layer. * Skip existing layers if it turns out that we need to generate a layer with the same persistent key in the same generation. Note that it is possible that the newly-generated layer has different content from the existing layer. For example, when the user drops a retain_lsn, the compaction could have combined or dropped some records, therefore creating a smaller layer than the existing one. We discard the "optimized" layer for now because we cannot deal with such rewrites within the same generation. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 47 ++- .../src/tenant/storage_layer/layer_desc.rs | 14 + pageserver/src/tenant/timeline/compaction.rs | 279 ++++++++++++++++-- .../src/tenant/timeline/layer_manager.rs | 14 +- 4 files changed, 320 insertions(+), 34 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b9257dfbe8..84c5095610 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6963,7 +6963,11 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..Key::MAX, + key_range: { + let mut key = Key::MAX; + key.field6 -= 1; + Key::MIN..key + }, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, @@ -6982,6 +6986,15 @@ mod tests { ] ); + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + Ok(()) } @@ -7333,6 +7346,15 @@ mod tests { ); } + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + Ok(()) } @@ -7837,6 +7859,10 @@ mod tests { ]; let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; for idx in 0..10 { assert_eq!( tline @@ -7847,7 +7873,7 @@ mod tests { ); assert_eq!( tline - .get(get_key(idx as u32), Lsn(0x30), &ctx) + .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] @@ -7873,7 +7899,24 @@ mod tests { let cancel = CancellationToken::new(); tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + verify_result().await; + // compact again + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + verify_result().await; + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.space = Lsn(0x38); + } + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result + + // not increasing the GC horizon and compact again + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); verify_result().await; Ok(()) diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bd765560e4..cbd18e650f 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -41,6 +41,20 @@ pub struct PersistentLayerKey { pub is_delta: bool, } +impl std::fmt::Display for PersistentLayerKey { + fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}..{} {}..{} is_delta={}", + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end, + self.is_delta + ) + } +} + impl PersistentLayerDesc { pub fn key(&self) -> PersistentLayerKey { PersistentLayerKey { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4fe9bbafab..61d662d25d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::BinaryHeap; +use std::collections::{BinaryHeap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -30,7 +30,9 @@ use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; +use crate::tenant::storage_layer::{ + AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, +}; use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; @@ -1368,7 +1370,7 @@ impl Timeline { pub(crate) async fn generate_key_retention( self: &Arc, key: Key, - history: &[(Key, Lsn, Value)], + full_history: &[(Key, Lsn, Value)], horizon: Lsn, retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, @@ -1376,14 +1378,14 @@ impl Timeline { ) -> anyhow::Result { // Pre-checks for the invariants if cfg!(debug_assertions) { - for (log_key, _, _) in history { + for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } - for i in 1..history.len() { - assert!(history[i - 1].1 <= history[i].1, "unordered LSN"); - if history[i - 1].1 == history[i].1 { + for i in 1..full_history.len() { + assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN"); + if full_history[i - 1].1 == full_history[i].1 { assert!( - matches!(history[i - 1].2, Value::Image(_)), + matches!(full_history[i - 1].2, Value::Image(_)), "unordered delta/image, or duplicated delta" ); } @@ -1414,7 +1416,7 @@ impl Timeline { } lsn_split_points.push(horizon); let mut current_idx = 0; - for item @ (_, lsn, _) in history { + for item @ (_, lsn, _) in full_history { while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { current_idx += 1; } @@ -1459,6 +1461,68 @@ impl Timeline { if let Some((key, lsn, img)) = base_img_from_ancestor { replay_history.push((key, lsn, Value::Image(img))); } + + /// Generate debug information for the replay history + fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some((key, _, _)) = replay_history.first() { + write!(output, "key={} ", key).unwrap(); + let mut cnt = 0; + for (_, lsn, val) in replay_history { + if val.is_image() { + write!(output, "i@{} ", lsn).unwrap(); + } else if val.will_init() { + write!(output, "di@{} ", lsn).unwrap(); + } else { + write!(output, "d@{} ", lsn).unwrap(); + } + cnt += 1; + if cnt >= 128 { + write!(output, "... 
and more").unwrap(); + break; + } + } + } else { + write!(output, "").unwrap(); + } + output + } + + fn generate_debug_trace( + replay_history: Option<&[(Key, Lsn, Value)]>, + full_history: &[(Key, Lsn, Value)], + lsns: &[Lsn], + horizon: Lsn, + ) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some(replay_history) = replay_history { + writeln!( + output, + "replay_history: {}", + generate_history_trace(replay_history) + ) + .unwrap(); + } else { + writeln!(output, "replay_history: ",).unwrap(); + } + writeln!( + output, + "full_history: {}", + generate_history_trace(full_history) + ) + .unwrap(); + writeln!( + output, + "when processing: [{}] horizon={}", + lsns.iter().map(|l| format!("{l}")).join(","), + horizon + ) + .unwrap(); + output + } + for (i, split_for_lsn) in split_history.into_iter().enumerate() { // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); @@ -1483,10 +1547,27 @@ impl Timeline { } } if let Some((_, _, val)) = replay_history.first() { - assert!(val.will_init(), "invalid history, no base image"); + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context( + || { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }, + ); + } } if generate_image && records_since_last_image > 0 { records_since_last_image = 0; + let replay_history_for_debug = if cfg!(debug_assertions) { + Some(replay_history.clone()) + } else { + None + }; + let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); let history = std::mem::take(&mut replay_history); let mut img = None; let mut records = Vec::with_capacity(history.len()); @@ -1494,14 +1575,30 @@ impl Timeline { img = Some((*lsn, val.clone())); for (_, lsn, val) in history.into_iter().skip(1) { let Value::WalRecord(rec) = val else { - panic!("invalid record") + return Err(anyhow::anyhow!( + "invalid record, first record is image, expect walrecords" + )) + .with_context(|| { + generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); }; records.push((lsn, rec)); } } else { for (_, lsn, val) in history.into_iter() { let Value::WalRecord(rec) = val else { - panic!("invalid record") + return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord")) + .with_context(|| generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + )); }; records.push((lsn, rec)); } @@ -1513,12 +1610,11 @@ impl Timeline { replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { - retention.push( - split_for_lsn - .iter() - .map(|(_, lsn, value)| (*lsn, value.clone())) - .collect(), - ); + let deltas = split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect_vec(); + retention.push(deltas); } } let mut result = Vec::with_capacity(retention.len()); @@ -1533,7 +1629,7 @@ impl Timeline { result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); } } - unreachable!() + unreachable!("key retention is empty") } /// An experimental compaction building block that combines compaction with garbage collection. @@ -1544,11 +1640,26 @@ impl Timeline { /// and create delta layers with all deltas >= gc horizon. 
pub(crate) async fn compact_with_gc( self: &Arc, - _cancel: &CancellationToken, + cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result<()> { use std::collections::BTreeSet; + // Block other compaction/GC tasks from running for now. GC-compaction could run along + // with legacy compaction tasks in the future. + + let _compaction_lock = tokio::select! { + guard = self.compaction_lock.lock() => guard, + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + }; + + let _gc = tokio::select! { + guard = self.gc_lock.lock() => guard, + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + }; + info!("running enhanced gc bottom-most compaction"); scopeguard::defer! { @@ -1644,6 +1755,13 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; + enum FlushDeltaResult { + /// Create a new resident layer + CreateResidentLayer(ResidentLayer), + /// Keep an original delta layer + KeepLayer(PersistentLayerKey), + } + #[allow(clippy::too_many_arguments)] async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, @@ -1654,7 +1772,7 @@ impl Timeline { lowest_retain_lsn: Lsn, ctx: &RequestContext, last_batch: bool, - ) -> anyhow::Result> { + ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid // overlapping layers. // @@ -1677,28 +1795,78 @@ impl Timeline { if !need_split && !last_batch { return Ok(None); } - let deltas = std::mem::take(deltas); + let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas); if deltas.is_empty() { return Ok(None); } let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; + let delta_key = PersistentLayerKey { + key_range: { + let key_start = deltas.first().unwrap().0; + let key_end = deltas.last().unwrap().0.next(); + key_start..key_end + }, + lsn_range: lowest_retain_lsn..end_lsn, + is_delta: true, + }; + { + // Hack: skip delta layer if we need to produce a layer of a same key-lsn. + // + // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. + // For example, consider the case where a single delta with range [0x10,0x50) exists. + // And we have branches at LSN 0x10, 0x20, 0x30. + // Then we delete branch @ 0x20. + // Bottom-most compaction may now delete the delta [0x20,0x30). + // And that wouldnt' change the shape of the layer. + // + // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. + // That's why it's safe to skip. + let guard = tline.layers.read().await; + + if guard.contains_key(&delta_key) { + let layer_generation = guard.get_from_key(&delta_key).metadata().generation; + drop(guard); + if layer_generation == tline.generation { + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. 
+ info!( + key=%delta_key, + ?layer_generation, + "discard delta layer due to duplicated layer in the same generation" + ); + return Ok(Some(FlushDeltaResult::KeepLayer(delta_key))); + } + } + } + let mut delta_layer_writer = DeltaLayerWriter::new( tline.conf, tline.timeline_id, tline.tenant_shard_id, - deltas.first().unwrap().0, + delta_key.key_range.start, lowest_retain_lsn..end_lsn, ctx, ) .await?; - let key_end = deltas.last().unwrap().0.next(); for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?; - Ok(Some(delta_layer)) + let delta_layer = delta_layer_writer + .finish(delta_key.key_range.end, tline, ctx) + .await?; + Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } + // Hack the key range to be min..(max-1). Otherwise, the image layer will be + // interpreted as an L0 delta layer. + let hack_image_layer_range = { + let mut end_key = Key::MAX; + end_key.field6 -= 1; + Key::MIN..end_key + }; + // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if self.ancestor_timeline.is_none() { @@ -1707,7 +1875,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - &(Key::MIN..Key::MAX), // covers the full key range + &hack_image_layer_range, // covers the full key range lowest_retain_lsn, ctx, ) @@ -1737,6 +1905,42 @@ impl Timeline { let img = tline.get(key, tline.ancestor_lsn, ctx).await?; Ok(Some((key, tline.ancestor_lsn, img))) } + let image_layer_key = PersistentLayerKey { + key_range: hack_image_layer_range, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn), + is_delta: false, + }; + + // Like with delta layers, it can happen that we re-produce an already existing image layer. + // This could happen when a user triggers force compaction and image generation. In this case, + // it's always safe to rewrite the layer. + let discard_image_layer = { + let guard = self.layers.read().await; + if guard.contains_key(&image_layer_key) { + let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation; + drop(guard); + if layer_generation == self.generation { + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. + info!( + key=%image_layer_key, + ?layer_generation, + "discard image layer due to duplicated layer key in the same generation", + ); + true + } else { + false + } + } else { + false + } + }; + + // Actually, we can decide not to write to the image layer at all at this point because + // the key and LSN range are determined. However, to keep things simple here, we still + // create this writer, and discard the writer in the end. let mut delta_values = Vec::new(); let delta_split_points = delta_split_points.into_iter().collect_vec(); @@ -1824,7 +2028,9 @@ impl Timeline { ); assert!(delta_values.is_empty(), "unprocessed keys"); - let image_layer = if let Some(writer) = image_layer_writer { + let image_layer = if discard_image_layer { + None + } else if let Some(writer) = image_layer_writer { Some(writer.finish(self, ctx).await?) 
} else { None @@ -1835,7 +2041,22 @@ impl Timeline { if image_layer.is_some() { 1 } else { 0 } ); let mut compact_to = Vec::new(); - compact_to.extend(delta_layers); + let mut keep_layers = HashSet::new(); + for action in delta_layers { + match action { + FlushDeltaResult::CreateResidentLayer(layer) => { + compact_to.push(layer); + } + FlushDeltaResult::KeepLayer(l) => { + keep_layers.insert(l); + } + } + } + if discard_image_layer { + keep_layers.insert(image_layer_key); + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); compact_to.extend(image_layer); // Step 3: Place back to the layer map. { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 1e4edd34ad..1bc2acbd34 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -35,6 +35,10 @@ impl LayerManager { self.layer_fmgr.get_from_desc(desc) } + pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer { + self.layer_fmgr.get_from_key(desc) + } + /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, @@ -365,16 +369,20 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { + fn get_from_key(&self, key: &PersistentLayerKey) -> T { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 - .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.layer_name())) + .get(key) + .with_context(|| format!("get layer from key: {}", key)) .expect("not found") .clone() } + fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { + self.get_from_key(&desc.key()) + } + fn contains_key(&self, key: &PersistentLayerKey) -> bool { self.0.contains_key(key) } From 4eea3ce70536c3afcb7f7731d2bf7178e7271f41 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 1 Aug 2024 16:55:43 +0100 Subject: [PATCH 321/412] test_runner: don't create artifacts if Allure is not enabled (#8580) ## Problem `allure_attach_from_dir` method might create `tar.zst` archives even if `--alluredir` is not set (i.e. Allure results collection is disabled) ## Summary of changes - Don't run `allure_attach_from_dir` if `--alluredir` is not set --- test_runner/fixtures/neon_fixtures.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c33dec784..b370a92e38 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4529,6 +4529,13 @@ def test_output_dir( yield test_dir + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). + # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + preserve_database_files = False for k, v in request.node.user_properties: # NB: the neon_env_builder fixture uses this fixture (test_output_dir). 
From 4631179320decea3e20ed8366d35698ead74bceb Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Aug 2024 08:00:46 +0100 Subject: [PATCH 322/412] pageserver: refine how we delete timelines after shard split (#8436) ## Problem Previously, when we do a timeline deletion, shards will delete layers that belong to an ancestor. That is not a correctness issue, because when we delete a timeline, we're always deleting it from all shards, and destroying data for that timeline is clearly fine. However, there exists a race where one shard might start doing this deletion while another shard has not yet received the deletion request, and might try to access an ancestral layer. This creates ambiguity over the "all layers referenced by my index should always exist" invariant, which is important to detecting and reporting corruption. Now that we have a GC mode for clearing up ancestral layers, we can rely on that to clean up such layers, and avoid deleting them right away. This makes things easier to reason about: there are now no cases where a shard will delete a layer that belongs to a ShardIndex other than itself. ## Summary of changes - Modify behavior of RemoteTimelineClient::delete_all - Add `test_scrubber_physical_gc_timeline_deletion` to exercise this case - Tweak AWS SDK config in the scrubber to enable retries. Motivated by seeing the test for this feature encounter some transient "service error" S3 errors (which are probably nothing to do with the changes in this PR) --- .../src/tenant/remote_timeline_client.rs | 12 +++ storage_scrubber/src/lib.rs | 8 ++ test_runner/regress/test_storage_scrubber.py | 78 ++++++++++++++++++- 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index fed666ca45..9e021c7e35 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1378,6 +1378,18 @@ impl RemoteTimelineClient { .dirty .layer_metadata .drain() + .filter(|(_file_name, meta)| { + // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from + // all shards anyway, we _could_ delete these, but + // - it creates a potential race if other shards are still + // using the layers while this shard deletes them. + // - it means that if we rolled back the shard split, the ancestor shards would be in a state where + // these timelines are present but corrupt (their index exists but some layers don't) + // + // These layers will eventually be cleaned up by the scrubber when it does physical GC. 
+ meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count + }) .map(|(file_name, meta)| { remote_layer_path( &self.tenant_shard_id.tenant_id, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 152319b731..1fc94cc174 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -16,6 +16,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, Context}; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; use aws_sdk_s3::Client; @@ -314,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option { } async fn init_s3_client(bucket_region: Region) -> Client { + let mut retry_config_builder = RetryConfigBuilder::new(); + + retry_config_builder + .set_max_attempts(Some(3)) + .set_mode(Some(RetryMode::Adaptive)); + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) + .retry_config(retry_config_builder.build()) .load() .await; Client::new(&config) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index fadf438788..e3f627b6a6 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -13,6 +13,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload @@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors( # attach it, to drop any local state, then check it's still readable. workload.stop() drop_local_state(env, tenant_id) - workload.validate() +def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): + """ + When we delete a timeline after a shard split, the child shards do not directly delete the + layers in the ancestor shards. They rely on the scrubber to clean up. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=None, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + + # Create a second timeline so that when we delete the first one, child shards still have some content in S3. + # + # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber + # doesn't know about it, and won't perceive its ancestors as ancestors. 
+ other_timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, tenant_id, other_timeline_id + ) + + # Write after split so that child shards have some indices in S3 + workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + # The timeline still exists in child shards and they reference its layers, so scrubbing + # now shouldn't delete anything. + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Delete the timeline + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id) + + # Subsequently doing physical GC should clean up the ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): """ Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards From 3833e30d44f7cd05842f00e0c2b5e308f21371b7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Aug 2024 09:37:44 +0100 Subject: [PATCH 323/412] storage_controller: start adding chaos hooks (#7946) Chaos injection bridges the gap between automated testing (where we do lots of different things with small, short-lived tenants), and staging (where we do many fewer things, but with larger, long-lived tenants). This PR adds a first type of chaos which isn't really very chaotic: it's live migration of tenants between healthy pageservers. This nevertheless provides continuous checks that things like clean, prompt shutdown of tenants works for realistically deployed pageservers with realistically large tenants. 
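For readers skimming the diff below, one chaos iteration boils down to: pick a random shard, demote its currently attached pageserver to a secondary, promote one of its secondaries, and request a reconcile. A minimal sketch with toy types (illustrative only; the real code operates on `TenantShard` intents via `demote_attached`/`promote_attached` and `maybe_reconcile_shard`, see `chaos_injector.rs` in the diff):

```rust
// Illustrative sketch with toy types -- not the storage controller's real API.
#[derive(Debug)]
struct Intent {
    attached: Option<u32>, // pageserver node id currently attached
    secondaries: Vec<u32>, // pageserver node ids holding secondary locations
}

fn inject_chaos(intent: &mut Intent, pick_secondary: impl Fn(&[u32]) -> Option<u32>) -> bool {
    // Pick a secondary to promote; skip shards that have no secondary location.
    let Some(new_attached) = pick_secondary(&intent.secondaries) else {
        return false;
    };
    // Skip shards that currently have no attached location.
    let Some(old_attached) = intent.attached.take() else {
        return false;
    };
    // Demote the old attached location to a secondary ...
    intent.secondaries.retain(|&n| n != new_attached);
    intent.secondaries.push(old_attached);
    // ... and promote the chosen secondary.
    intent.attached = Some(new_attached);
    // The caller would now schedule a reconcile so the new intent is acted upon.
    true
}

fn main() {
    let mut intent = Intent { attached: Some(1), secondaries: vec![2, 3] };
    let migrated = inject_chaos(&mut intent, |secondaries| secondaries.first().copied());
    println!("migrated={migrated}, intent={intent:?}");
}
```

The real implementation performs this for a batch of up to 128 randomly chosen shards per tick, while holding the service's inner lock, and is driven by the new `--chaos-interval` flag.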
--- Cargo.lock | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/main.rs | 28 ++++++++ storage_controller/src/service.rs | 2 + .../src/service/chaos_injector.rs | 71 +++++++++++++++++++ 5 files changed, 103 insertions(+) create mode 100644 storage_controller/src/service/chaos_injector.rs diff --git a/Cargo.lock b/Cargo.lock index 2677699702..764c0fbd30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5703,6 +5703,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", + "rand 0.8.5", "reqwest 0.12.4", "routerify", "scopeguard", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index d14b235046..ecaac04915 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index adbf5c6496..2799f21fdc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -9,12 +9,14 @@ use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; +use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; @@ -86,6 +88,10 @@ struct Cli { // TODO: make `cfg(feature = "testing")` #[arg(long)] neon_local_repo_dir: Option, + + /// Chaos testing + #[arg(long)] + chaos_interval: Option, } enum StrictMode { @@ -309,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> { tracing::info!("Serving on {0}", args.listen); let server_task = tokio::task::spawn(server); + let chaos_task = args.chaos_interval.map(|interval| { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + ( + tokio::task::spawn( + async move { + let mut chaos_injector = ChaosInjector::new(service, interval.into()); + chaos_injector.run(cancel_bg).await + } + .instrument(tracing::info_span!("chaos_injector")), + ), + cancel, + ) + }); + // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; @@ -337,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> { } } + // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down + if let Some((chaos_jh, chaos_cancel)) = chaos_task { + chaos_cancel.cancel(); + chaos_jh.await.ok(); + } + service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ea515f67da..6940bf2c64 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -84,6 +84,8 @@ use crate::{ }; use serde::{Deserialize, Serialize}; +pub mod chaos_injector; + // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); diff --git 
a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs new file mode 100644 index 0000000000..99961d691c --- /dev/null +++ b/storage_controller/src/service/chaos_injector.rs @@ -0,0 +1,71 @@ +use std::{sync::Arc, time::Duration}; + +use rand::seq::SliceRandom; +use rand::thread_rng; +use tokio_util::sync::CancellationToken; + +use super::Service; + +pub struct ChaosInjector { + service: Arc, + interval: Duration, +} + +impl ChaosInjector { + pub fn new(service: Arc, interval: Duration) -> Self { + Self { service, interval } + } + + pub async fn run(&mut self, cancel: CancellationToken) { + let mut interval = tokio::time::interval(self.interval); + + loop { + tokio::select! { + _ = interval.tick() => {} + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + } + + self.inject_chaos().await; + + tracing::info!("Chaos iteration..."); + } + } + + async fn inject_chaos(&mut self) { + // Pick some shards to interfere with + let batch_size = 128; + let mut inner = self.service.inner.write().unwrap(); + let (nodes, tenants, scheduler) = inner.parts_mut(); + let tenant_ids = tenants.keys().cloned().collect::>(); + let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); + + for victim in victims { + let shard = tenants + .get_mut(victim) + .expect("Held lock between choosing ID and this get"); + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); + continue; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {victim}: currently has no attached location"); + continue; + }; + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + } +} From 7eb3d6bb2dec03925487e618ae26618f15e87c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 2 Aug 2024 13:07:12 +0200 Subject: [PATCH 324/412] Wait for completion of the upload queue in flush_frozen_layer (#8550) Makes `flush_frozen_layer` add a barrier to the upload queue and makes it wait for that barrier to be reached until it lets the flushing be completed. This gives us backpressure and ensures that writes can't build up in an unbounded fashion. Fixes #7317 --- compute_tools/Cargo.toml | 5 ++ compute_tools/src/compute.rs | 10 ++- control_plane/src/background_process.rs | 2 +- pageserver/src/tenant/timeline.rs | 20 ++++- test_runner/fixtures/neon_fixtures.py | 12 ++- test_runner/fixtures/pageserver/http.py | 2 + test_runner/regress/test_branching.py | 23 ++++-- test_runner/regress/test_remote_storage.py | 87 +++------------------- 8 files changed, 74 insertions(+), 87 deletions(-) diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8ceb8f2ad2..8af0ed43ce 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,6 +4,11 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test specific features. 
+testing = [] + [dependencies] anyhow.workspace = true async-compression.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 91855d954d..5bd6897fe3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -400,7 +400,15 @@ impl ComputeNode { pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 10; + const DEFAULT_ATTEMPTS: u16 = 10; + #[cfg(feature = "testing")] + let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") { + u16::from_str(&v).unwrap() + } else { + DEFAULT_ATTEMPTS + }; + #[cfg(not(feature = "testing"))] + let max_attempts = DEFAULT_ATTEMPTS; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index a272c306e7..bf8a27e550 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { for (var, val) in std::env::vars() { - if var.starts_with("NEON_PAGESERVER_") { + if var.starts_with("NEON_") { cmd = cmd.env(var, val); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 37ebeded66..be72e15c19 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -143,7 +143,10 @@ use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{config::TenantConf, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; use super::{ secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, GcError, @@ -4089,6 +4092,21 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote. + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. 
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b370a92e38..7289472de2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1943,11 +1943,15 @@ class NeonCli(AbstractNeonCli): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", "start", ] + extra_env_vars = {} + if basebackup_request_tries is not None: + extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) @@ -1960,7 +1964,7 @@ class NeonCli(AbstractNeonCli): if allow_multiple: args.extend(["--allow-multiple"]) - res = self.raw_cli(args) + res = self.raw_cli(args, extra_env_vars) res.check_returncode() return res @@ -3812,6 +3816,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Start the Postgres instance. @@ -3833,6 +3838,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) self._running.release(1) @@ -3979,6 +3985,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
@@ -3999,6 +4006,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -4042,6 +4050,7 @@ class EndpointFactory: config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + basebackup_request_tries: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4060,6 +4069,7 @@ class EndpointFactory: lsn=lsn, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + basebackup_request_tries=basebackup_request_tries, ) def create( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index c6df6b5baf..192324f086 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -663,6 +663,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_image_layer_creation=False, wait_until_uploaded=False, compact: Optional[bool] = None, + **kwargs, ): self.is_testing_enabled_or_skip() query = {} @@ -680,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, + **kwargs, ) log.info(f"Got checkpoint request response code: {res.status_code}") self.verbose_error(res) diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 190b624a54..fc74707639 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline", ] ) ps_http = env.pageserver.http_client() @@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: - # FIXME: paused uploads bother shutdown env.pageserver.stop(immediate=True) t.join() @@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline 
request_id=.*}: request was dropped before completing.*", + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*", + ] ) ps_http = env.pageserver.http_client() @@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 09f941f582..2e5260ca78 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries( def churn_while_failpoints_active(result): overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c") + # this call will wait for the failpoints to be turned off client.timeline_checkpoint(tenant_id, timeline_id) client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d") @@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") @@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_compaction_waits_for_upload( +def test_paused_upload_stalls_checkpoint( neon_env_builder: NeonEnvBuilder, ): """ - This test forces a race between upload and compaction. + This test checks that checkpoints block on uploads to remote storage. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -788,6 +788,10 @@ def test_compaction_waits_for_upload( } ) + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -808,76 +812,9 @@ def test_compaction_waits_for_upload( endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - client.timeline_checkpoint(tenant_id, timeline_id) - deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers()) - assert ( - deltas_at_first == 2 - ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement." 
- - endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)") - endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name() - - assert len(upload_stuck_layers) > 0 - - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "while uploads are stuck the layers should be present on disk" - - # now this will do the L0 => L1 compaction and want to remove - # upload_stuck_layers and the original initdb L0 - client.timeline_checkpoint(tenant_id, timeline_id) - - # as uploads are paused, the upload_stuck_layers should still be with us - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "uploads are stuck still over compaction" - - compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - overlap = compacted_layers.intersection(upload_stuck_layers) - assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction" - assert ( - len(compacted_layers) == 1 - ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" - - def layer_deletes_completed(): - m = client.get_metric_value("pageserver_layer_completed_deletes_total") - if m is None: - return 0 - return int(m) - - # if initdb created an initial delta layer, it might already be gc'd - # because it was uploaded before the failpoint was enabled. however, the - # deletion is not guaranteed to be complete. - assert layer_deletes_completed() <= 1 - - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - # Ensure that this actually terminates - wait_upload_queue_empty(client, tenant_id, timeline_id) - - def until_layer_deletes_completed(): - deletes = layer_deletes_completed() - log.info(f"layer_deletes: {deletes}") - # ensure that initdb delta layer AND the previously stuck are now deleted - assert deletes >= len(upload_stuck_layers) + 1 - - wait_until(10, 1, until_layer_deletes_completed) - - for name in upload_stuck_layers: - assert not env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" - - # We should not have hit the error handling path in uploads where a uploaded file is gone - assert not env.pageserver.log_contains( - "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." - ) + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) def wait_upload_queue_empty( From 1758c10dec4fbd361febbfc9f25d1c7c2b3bc892 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 2 Aug 2024 15:26:46 +0100 Subject: [PATCH 325/412] Improve safekeepers eviction rate limiting (#8456) This commit tries to fix regular load spikes on staging, caused by too many eviction and partial upload operations running at the same time. Usually it was hapenning after restart, for partial backup the load was delayed. 
- Add a semaphore for evictions (2 permits by default) - Rename `resident_since` to `evict_not_before` and smooth out the curve by using random duration - Use random duration in partial uploads as well related to https://github.com/neondatabase/neon/issues/6338 some discussion in https://neondb.slack.com/archives/C033RQ5SPDH/p1720601531744029 --- safekeeper/src/lib.rs | 2 ++ safekeeper/src/rate_limit.rs | 49 ++++++++++++++++++++++++++ safekeeper/src/timeline.rs | 3 +- safekeeper/src/timeline_eviction.rs | 6 ++-- safekeeper/src/timeline_manager.rs | 48 ++++++++++++++++++------- safekeeper/src/timelines_global_map.rs | 14 +++++--- safekeeper/src/wal_backup_partial.rs | 39 +++++++------------- 7 files changed, 112 insertions(+), 49 deletions(-) create mode 100644 safekeeper/src/rate_limit.rs diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8f2920ada3..56d61e8287 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -21,6 +21,7 @@ pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; +pub mod rate_limit; pub mod receive_wal; pub mod recovery; pub mod remove_wal; @@ -53,6 +54,7 @@ pub mod defaults { pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2; // By default, our required residency before eviction is the same as the period that passes // before uploading a partial segment, so that in normal operation the eviction can happen diff --git a/safekeeper/src/rate_limit.rs b/safekeeper/src/rate_limit.rs new file mode 100644 index 0000000000..72373b5786 --- /dev/null +++ b/safekeeper/src/rate_limit.rs @@ -0,0 +1,49 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::metrics::MISC_OPERATION_SECONDS; + +/// Global rate limiter for background tasks. +#[derive(Clone)] +pub struct RateLimiter { + partial_backup: Arc, + eviction: Arc, +} + +impl RateLimiter { + /// Create a new rate limiter. + /// - `partial_backup_max`: maximum number of concurrent partial backups. + /// - `eviction_max`: maximum number of concurrent timeline evictions. + pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self { + Self { + partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)), + eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)), + } + } + + /// Get a permit for partial backup. This will block if the maximum number of concurrent + /// partial backups is reached. + pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.partial_backup + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } + + /// Try to get a permit for timeline eviction. This will return None if the maximum number of + /// concurrent timeline evictions is reached. + pub fn try_acquire_eviction(&self) -> Option { + self.eviction.clone().try_acquire_owned().ok() + } +} + +/// Generate a random duration that is a fraction of the given duration. 
+pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + duration.mul_f64(randf64) +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 132e5ec32f..57935d879f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -25,6 +25,7 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, @@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; -use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter}; +use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 7947d83eb4..ae6f3f4b7e 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,7 +5,6 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename; use crate::{ metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + rate_limit::rand_duration, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -50,7 +50,6 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 - && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -112,7 +111,8 @@ impl Manager { return; } - self.resident_since = Instant::now(); + self.evict_not_before = + tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); info!("successfully restored evicted timeline"); } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index debf8c824f..c224dcd398 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -23,6 +23,7 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, + rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, safekeeper::Term, @@ -32,7 +33,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment, RateLimiter}, + wal_backup_partial::{self, PartialRemoteSegment}, SafeKeeperConf, }; @@ -185,11 +186,11 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, - pub(crate) partial_backup_rate_limiter: RateLimiter, + pub(crate) global_rate_limiter: RateLimiter, // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not // evict them if they go inactive very soon after being restored. 
- pub(crate) resident_since: std::time::Instant, + pub(crate) evict_not_before: Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -202,7 +203,7 @@ pub async fn main_task( broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, ) { tli.set_status(Status::Started); @@ -220,7 +221,7 @@ pub async fn main_task( conf, broker_active_set, manager_tx, - partial_backup_rate_limiter, + global_rate_limiter, ) .await; @@ -254,9 +255,29 @@ pub async fn main_task( mgr.set_status(Status::UpdatePartialBackup); mgr.update_partial_backup(&state_snapshot).await; - if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) { - mgr.set_status(Status::EvictTimeline); - mgr.evict_timeline().await; + let now = Instant::now(); + if mgr.evict_not_before > now { + // we should wait until evict_not_before + update_next_event(&mut next_event, mgr.evict_not_before); + } + + if mgr.conf.enable_offload + && mgr.evict_not_before <= now + && mgr.ready_for_eviction(&next_event, &state_snapshot) + { + // check rate limiter and evict timeline if possible + match mgr.global_rate_limiter.try_acquire_eviction() { + Some(_permit) => { + mgr.set_status(Status::EvictTimeline); + mgr.evict_timeline().await; + } + None => { + // we can't evict timeline now, will try again later + mgr.evict_not_before = + Instant::now() + rand_duration(&mgr.conf.eviction_min_resident); + update_next_event(&mut next_event, mgr.evict_not_before); + } + } } } @@ -334,11 +355,10 @@ impl Manager { conf: SafeKeeperConf, broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { - conf, wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), state_version_rx: tli.get_state_version_rx(), @@ -353,8 +373,10 @@ impl Manager { partial_backup_uploaded, access_service: AccessService::new(manager_tx), tli, - partial_backup_rate_limiter, - resident_since: std::time::Instant::now(), + global_rate_limiter, + // to smooth out evictions spike after restart + evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident), + conf, } } @@ -541,7 +563,7 @@ impl Manager { self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), - self.partial_backup_rate_limiter.clone(), + self.global_rate_limiter.clone(), ))); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index f57da5c7cb..6662e18817 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,10 +2,11 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. 
+use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::rate_limit::RateLimiter; use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup_partial::RateLimiter; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -31,7 +32,7 @@ struct GlobalTimelinesState { conf: Option, broker_active_set: Arc, load_lock: Arc>, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. @@ -50,7 +51,7 @@ impl GlobalTimelinesState { ( self.get_conf().clone(), self.broker_active_set.clone(), - self.partial_backup_rate_limiter.clone(), + self.global_rate_limiter.clone(), ) } @@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), - partial_backup_rate_limiter: RateLimiter::new(1), + global_rate_limiter: RateLimiter::new(1, 1), }) }); @@ -99,7 +100,10 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency); + state.global_rate_limiter = RateLimiter::new( + conf.partial_backup_concurrency, + DEFAULT_EVICTION_CONCURRENCY, + ); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index b1efa9749f..52765b0e98 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,8 +18,6 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. -use std::sync::Arc; - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; @@ -30,6 +28,7 @@ use utils::lsn::Lsn; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + rate_limit::{rand_duration, RateLimiter}, safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, @@ -37,30 +36,6 @@ use crate::{ SafeKeeperConf, }; -#[derive(Clone)] -pub struct RateLimiter { - semaphore: Arc, -} - -impl RateLimiter { - pub fn new(permits: usize) -> Self { - Self { - semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), - } - } - - async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit { - let _timer = MISC_OPERATION_SECONDS - .with_label_values(&["partial_permit_acquire"]) - .start_timer(); - self.semaphore - .clone() - .acquire_owned() - .await - .expect("semaphore is closed") - } -} - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { /// Upload is in progress. This status should be used only for garbage collection, @@ -352,6 +327,7 @@ pub async fn main_task( ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; + let mut first_iteration = true; let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); @@ -419,6 +395,15 @@ pub async fn main_task( } } + // smoothing the load after restart, by sleeping for a random time. 
+ // if this is not the first iteration, we will wait for the full await_duration + let await_duration = if first_iteration { + first_iteration = false; + rand_duration(&await_duration) + } else { + await_duration + }; + // fixing the segno and waiting some time to prevent reuploading the same segment too often let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); let timeout = tokio::time::sleep(await_duration); @@ -454,7 +439,7 @@ pub async fn main_task( } // limit concurrent uploads - let _upload_permit = limiter.acquire_owned().await; + let _upload_permit = limiter.acquire_partial_backup().await; let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { From 3a10bf8c823f7b93405454e0befd73f1cc81f140 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Aug 2024 18:28:23 +0100 Subject: [PATCH 326/412] tests: add test_historic_storage_formats (#8423) ## Problem Currently, our backward compatibility tests only look one release back. That means, for example, that when we switch on image layer compression by default, we'll test reading of uncompressed layers for one release, and then stop doing it. When we make an index_part.json format change, we'll test against the old format for a week, then stop (unless we write separate unit tests for each old format). The reality in the field is that data in old formats will continue to exist for weeks/months/years. When we make major format changes, we should retain examples of the old format data, and continuously verify that the latest code can still read them. This test uses contents from a new path in the public S3 bucket, `compatibility-data-snapshots/`. It is populated by hand. The first important artifact is one from before we switch on compression, so that we will keep testing reads of uncompressed data. We will generate more artifacts ahead of other key changes, like when we update remote storage format for archival timelines. Closes: https://github.com/neondatabase/cloud/issues/15576 --- test_runner/regress/test_compatibility.py | 142 ++++++++++++++++++++-- 1 file changed, 135 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 411b20b2c4..137b0e931d 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -3,18 +3,15 @@ import re import shutil import subprocess import tempfile +from dataclasses import dataclass from pathlib import Path from typing import List, Optional import pytest import toml -from fixtures.common_types import Lsn +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - PgBin, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -22,7 +19,8 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage +from fixtures.workload import Workload # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
@@ -409,3 +407,133 @@ def dump_differs( break return differs + + +@dataclass +class HistoricDataSet: + name: str + tenant_id: TenantId + pg_version: PgVersion + url: str + + def __str__(self): + return self.name + + +HISTORIC_DATA_SETS = [ + # From before we enabled image layer compression. + # - IndexPart::LATEST_VERSION 7 + # - STORAGE_FORMAT_VERSION 3 + HistoricDataSet( + "2024-07-18", + TenantId("17bf64a53509714687664b3a84e9b3ba"), + PgVersion.V16, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", + ), +] + + +@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS) +@pytest.mark.xdist_group("compatibility") +def test_historic_storage_formats( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + dataset: HistoricDataSet, +): + """ + This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago. + """ + + ARTIFACT_CACHE_DIR = "./artifact_cache" + + import tarfile + from contextlib import closing + + import requests + import zstandard + + artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name) + + # Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by + # HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version + # will no longer be covered by this test. + if pg_version != dataset.pg_version: + pytest.skip(f"Dataset {dataset} is for different PG version, skipping") + + with closing(requests.get(dataset.url, stream=True)) as r: + unzstd = zstandard.ZstdDecompressor() + with unzstd.stream_reader(r.raw) as stream: + with tarfile.open(mode="r|", fileobj=stream) as tf: + tf.extractall(artifact_unpack_path) + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.pg_version = dataset.pg_version + env = neon_env_builder.init_configs() + env.start() + assert isinstance(env.pageserver_remote_storage, S3Storage) + + # Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing + # compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices. + # + # The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. 
We use + # S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs) + artifact_pageserver_path = ( + artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver") + ) + for root, _dirs, files in os.walk(artifact_pageserver_path): + for file in files: + local_path = os.path.join(root, file) + remote_key = ( + env.pageserver_remote_storage.prefix_in_bucket + + str(local_path)[len(str(artifact_pageserver_path)) :] + ) + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + + # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) + # + # Do this _before_ importing to the pageserver, as that import may start writing immediately + metadata_summary = env.storage_scrubber.scan_metadata() + assert metadata_summary["tenant_count"] >= 1 + assert metadata_summary["timeline_count"] >= 1 + assert not metadata_summary["with_errors"] + assert not metadata_summary["with_warnings"] + + env.neon_cli.import_tenant(dataset.tenant_id) + + # Discover timelines + timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id) + # All our artifacts should contain at least one timeline + assert len(timelines) > 0 + + # TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very + # least they should include a mixture of deltas and image layers. Preferably they should also + # contain some "exotic" stuff like aux files from logical replication. + + # Check we can start an endpoint and read the SQL that the artifact is meant to contain + reference_sql_dump = artifact_unpack_path / Path("dump.sql") + ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + pg_bin.run_capture( + ["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] + ) + assert not dump_differs( + reference_sql_dump, + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + ep.stop() + + # Check we can also do writes to the database + existing_timeline_id = TimelineId(timelines[0]["timeline_id"]) + workload = Workload(env, dataset.tenant_id, existing_timeline_id) + workload.init() + workload.write_rows(100) + + # Check that compaction works + env.pageserver.http_client().timeline_compact( + dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True + ) From 8624aabc984f504dd9b89f47984e1ab7f792e8e1 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Fri, 2 Aug 2024 19:52:04 -0400 Subject: [PATCH 327/412] fix(pageserver): deadlock in gc-compaction (#8590) We need both compaction and gc lock for gc-compaction. The lock order should be the same everywhere, otherwise there could be a deadlock where A waits for B and B waits for A. We also had a double-lock issue. The compaction lock gets acquired in the outer `compact` function. Note that the unit tests directly call `compact_with_gc`, and therefore not triggering the issue. ## Summary of changes Ensure all places acquire compact lock and then gc lock. Remove an extra compact lock acqusition. 
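To make the ordering rule concrete, here is a minimal self-contained sketch (toy `tokio::sync::Mutex`es standing in for the timeline's compaction and gc locks, not the actual pageserver code). Both tasks take the locks in the same compaction -> gc order, which is the invariant this patch enforces; if either task flipped its two `lock().await` calls, the two tasks could each end up holding one lock while waiting forever for the other:

```rust
use std::sync::Arc;
use tokio::sync::Mutex;

// Toy stand-ins for the timeline's compaction_lock and gc_lock.
async fn gc_compaction(compaction: Arc<Mutex<()>>, gc: Arc<Mutex<()>>) {
    let _compaction_guard = compaction.lock().await; // 1st: compaction
    let _gc_guard = gc.lock().await; // 2nd: gc
    // ... compact with both locks held ...
}

async fn delete_local_timeline(compaction: Arc<Mutex<()>>, gc: Arc<Mutex<()>>) {
    // Same order as above. Acquiring `gc` first here would be the
    // "A waits for B and B waits for A" shape described above.
    let _compaction_guard = compaction.lock().await;
    let _gc_guard = gc.lock().await;
    // ... delete layer files with both locks held ...
}

#[tokio::main]
async fn main() {
    let compaction = Arc::new(Mutex::new(()));
    let gc = Arc::new(Mutex::new(()));
    tokio::join!(
        gc_compaction(compaction.clone(), gc.clone()),
        delete_local_timeline(compaction, gc),
    );
}
```

The separate double-lock problem (re-acquiring the compaction lock inside `compact_with_gc` while the outer `compact` already holds it) is not shown in this sketch; its fix is simply dropping the second acquisition, as the diff below does.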
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 28 ++++++++++++-------- pageserver/src/tenant/timeline/delete.rs | 20 ++++++++++---- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 61d662d25d..421f718ad6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1646,19 +1646,23 @@ impl Timeline { use std::collections::BTreeSet; // Block other compaction/GC tasks from running for now. GC-compaction could run along - // with legacy compaction tasks in the future. + // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. + // Note that we already acquired the compaction lock when the outer `compact` function gets called. - let _compaction_lock = tokio::select! { - guard = self.compaction_lock.lock() => guard, - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + let gc_lock = async { + tokio::select! { + guard = self.gc_lock.lock() => Ok(guard), + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => Err(anyhow!("cancelled")), + } }; - let _gc = tokio::select! { - guard = self.gc_lock.lock() => guard, - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => return Err(anyhow!("cancelled")), - }; + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", + std::time::Duration::from_secs(5), + ) + .await?; info!("running enhanced gc bottom-most compaction"); @@ -2063,9 +2067,11 @@ impl Timeline { let mut guard = self.layers.write().await; guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; - self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; + + drop(gc_lock); + Ok(()) } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 9b2403f899..05178c38b4 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory( tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; - let guards = crate::timed( - guards, - "acquire gc and compaction locks", + // Always ensure the lock order is compaction -> gc. + let compaction_lock = timeline.compaction_lock.lock(); + let compaction_lock = crate::timed( + compaction_lock, + "acquires compaction lock", + std::time::Duration::from_secs(5), + ) + .await; + + let gc_lock = timeline.gc_lock.lock(); + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", std::time::Duration::from_secs(5), ) .await; @@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory( .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); - drop(guards); + drop(gc_lock); + drop(compaction_lock); fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? From fde8aa103ebe3189a74e8c3e4beb7a3eaba0c7e4 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
Date: Mon, 5 Aug 2024 13:55:36 +0800 Subject: [PATCH 328/412] feat(pageserver): support auto split layers based on size (#8574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit part of https://github.com/neondatabase/neon/issues/8002 ## Summary of changes Add a `SplitImageWriter` that automatically splits image layer based on estimated target image layer size. This does not consider compression and we might need a better metrics. --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant/storage_layer.rs | 3 + .../src/tenant/storage_layer/image_layer.rs | 58 ++++- .../src/tenant/storage_layer/split_writer.rs | 244 ++++++++++++++++++ 3 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/split_writer.rs diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4fd110359b..59d3e1ce09 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,6 +8,9 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; +#[cfg(test)] +pub mod split_writer; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::walrecord::NeonWalRecord; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 08db27514a..aa308ba3c1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -742,8 +742,14 @@ struct ImageLayerWriterInner { // where we have chosen their compressed form uncompressed_bytes_chosen: u64, + // Number of keys in the layer. + num_keys: usize, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, + + #[cfg_attr(not(feature = "testing"), allow(dead_code))] + last_written_key: Key, } impl ImageLayerWriterInner { @@ -800,6 +806,8 @@ impl ImageLayerWriterInner { uncompressed_bytes: 0, uncompressed_bytes_eligible: 0, uncompressed_bytes_chosen: 0, + num_keys: 0, + last_written_key: Key::MIN, }; Ok(writer) @@ -820,6 +828,7 @@ impl ImageLayerWriterInner { let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; + self.num_keys += 1; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) @@ -839,6 +848,11 @@ impl ImageLayerWriterInner { key.write_to_byte_slice(&mut keybuf); self.tree.append(&keybuf, off)?; + #[cfg(feature = "testing")] + { + self.last_written_key = key; + } + Ok(()) } @@ -849,6 +863,7 @@ impl ImageLayerWriterInner { self, timeline: &Arc, ctx: &RequestContext, + end_key: Option, ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -899,11 +914,23 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - self.key_range.clone(), + if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }, self.lsn, metadata.len(), ); + #[cfg(feature = "testing")] + if let Some(end_key) = end_key { + assert!( + self.last_written_key < end_key, + "written key violates end_key range" + ); + } + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. 
@@ -980,6 +1007,18 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } + #[cfg(test)] + /// Estimated size of the image layer. + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + /// /// Finish writing the image layer. /// @@ -988,7 +1027,22 @@ impl ImageLayerWriter { timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline, ctx).await + self.inner.take().unwrap().finish(timeline, ctx, None).await + } + + #[cfg(test)] + /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. + pub(super) async fn finish_with_end_key( + mut self, + timeline: &Arc, + end_key: Key, + ctx: &RequestContext, + ) -> anyhow::Result { + self.inner + .take() + .unwrap() + .finish(timeline, ctx, Some(end_key)) + .await } } diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs new file mode 100644 index 0000000000..a4091a890c --- /dev/null +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -0,0 +1,244 @@ +use std::sync::Arc; + +use bytes::Bytes; +use pageserver_api::key::{Key, KEY_SIZE}; +use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; + +use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; + +use super::{ImageLayerWriter, ResidentLayer}; + +/// An image writer that takes images and produces multiple image layers. The interface does not +/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files +/// to be cleaned up) +#[must_use] +pub struct SplitImageLayerWriter { + inner: ImageLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn: Lsn, +} + +impl SplitImageLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn: Lsn, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: ImageLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + &(start_key..Key::MAX), + lsn, + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn, + }) + } + + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is an upper bound of the space that the key/image could take + // because we did not consider compression in this estimation. The resulting image layer + // could be smaller than the target size. 
+ let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_image_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(key..Key::MAX), + self.lsn, + ctx, + ) + .await?; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + self.generated_layers.push( + prev_image_writer + .finish_with_end_key(tline, key, ctx) + .await?, + ); + } + self.inner.put_image(key, img, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); + Ok(generated_layers) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::AsLayerDesc, + }, + DEFAULT_PG_VERSION, + }; + + use super::*; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + fn get_img(id: u32) -> Bytes { + format!("{id:064}").into() + } + + fn get_large_img() -> Bytes { + vec![0; 8192].into() + } + + #[tokio::test] + async fn write_one_image() { + let harness = TenantHarness::create("split_writer_write_one_image") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + assert_eq!(layers.len(), 1); + } + + #[tokio::test] + async fn write_split() { + let harness = TenantHarness::create("split_writer_write_split") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + const N: usize = 2000; + for i in 0..N { + let i = i as u32; + writer + .put_image(get_key(i), get_large_img(), &tline, &ctx) + .await + .unwrap(); + } + let layers = writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(layers.len(), N / 512 + 1); + for idx in 0..layers.len() { + assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + layers[idx - 1].layer_desc().key_range.end, + layers[idx].layer_desc().key_range.start + ); + } + } + } + + #[tokio::test] + async fn write_large_img() { + let harness = TenantHarness::create("split_writer_write_large_img") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024, + &ctx, + ) + 
.await + .unwrap(); + + writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + writer + .put_image(get_key(1), get_large_img(), &tline, &ctx) + .await + .unwrap(); + let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + assert_eq!(layers.len(), 2); + } +} From 7ec831c95643606f7d1d3343a2d7b84b23676605 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Aug 2024 20:14:45 +0300 Subject: [PATCH 329/412] fix: drain completed page_service connections (#8632) We've noticed increased memory usage with the latest release. Drain the joinset of `page_service` connection handlers to avoid leaking them until shutdown. An alternative would be to use a TaskTracker. TaskTracker was not discussed in original PR #8339 review, so not hot fixing it in here either. --- pageserver/src/page_service.rs | 42 +++++++++++----------- test_runner/regress/test_bad_connection.py | 11 +++++- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5344b83e0d..81294291a9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -122,16 +122,19 @@ impl Listener { } } impl Connections { - pub async fn shutdown(self) { + pub(crate) async fn shutdown(self) { let Self { cancel, mut tasks } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { - // the logging done here mimics what was formerly done by task_mgr - match res { - Ok(Ok(())) => {} - Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), - Err(e) => error!("page_service connection task panicked: {:?}", e), - } + Self::handle_connection_completion(res); + } + } + + fn handle_connection_completion(res: Result, tokio::task::JoinError>) { + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), } } } @@ -155,20 +158,19 @@ pub async fn libpq_listener_main( let connections_cancel = CancellationToken::new(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); - // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = tokio::select! { - biased; + loop { + let accepted = tokio::select! { + biased; + _ = listener_cancel.cancelled() => break, + next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { + let res = next.expect("we dont poll while empty"); + Connections::handle_connection_completion(res); + continue; + } + accepted = listener.accept() => accepted, + }; - _ = listener_cancel.cancelled() => { - // We were requested to shut down. - None - } - - res = listener.accept() => { - Some(res) - } - } { - match res { + match accepted { Ok((socket, peer_addr)) => { // Connection established. Spawn a new task to handle it. 
debug!("accepted connection from {}", peer_addr); diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index 82a3a05c2b..392b73c1f7 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -10,7 +10,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*simulated connection error.*") + env.pageserver.allowed_errors.append(".*simulated connection error.*") # this is never hit + + # the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out. + env.pageserver.allowed_errors.append( + ".*ERROR error in page_service connection task: Postgres query error" + ) # Enable failpoint before starting everything else up so that we exercise the retry # on fetching basebackup @@ -69,3 +74,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() From b76ab45cbe10e54678ed24094e81f0addcd788c2 Mon Sep 17 00:00:00 2001 From: dotdister Date: Mon, 5 Aug 2024 16:23:59 +0900 Subject: [PATCH 330/412] safekeeper: remove unused partial_backup_enabled option (#8547) ## Problem There is an unused safekeeper option `partial_backup_enabled`. `partial_backup_enabled` was implemented in #6530, but this option was always turned into enabled in #8022. If you intended to keep this option for a specific reason, I will close this PR. ## Summary of changes I removed an unused safekeeper option `partial_backup_enabled`. --- safekeeper/src/bin/safekeeper.rs | 6 ------ safekeeper/src/lib.rs | 2 -- safekeeper/src/timeline_manager.rs | 4 ++-- safekeeper/tests/walproposer_sim/safekeeper.rs | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2365fd0587..41c2d3fe08 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -170,11 +170,6 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, - /// Enable partial backup. If disabled, safekeeper will not upload partial - /// segments to remote storage. - /// TODO: now partial backup is always enabled, remove this flag. - #[arg(long)] - partial_backup_enabled: bool, /// Controls how long backup will wait until uploading the partial segment. 
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, @@ -347,7 +342,6 @@ async fn main() -> anyhow::Result<()> { sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, - partial_backup_enabled: true, partial_backup_timeout: args.partial_backup_timeout, disable_periodic_broker_push: args.disable_periodic_broker_push, enable_offload: args.enable_offload, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 56d61e8287..2e11a279ca 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -93,7 +93,6 @@ pub struct SafeKeeperConf { pub sk_auth_token: Option, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, - pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, pub disable_periodic_broker_push: bool, pub enable_offload: bool, @@ -137,7 +136,6 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c224dcd398..482614fac7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -544,8 +544,8 @@ impl Manager { /// Spawns partial WAL backup task if needed. async fn update_partial_backup(&mut self, state: &StateSnapshot) { - // check if partial backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled { + // check if WAL backup is enabled and should be started + if !self.conf.is_wal_backup_enabled() { return; } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0c6d97ddfa..771d905c90 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -181,7 +181,6 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { sk_auth_token: None, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, From 5945eadd422b3151f142a766434da92825b1741f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Mon, 5 Aug 2024 18:30:49 +0800 Subject: [PATCH 331/412] feat(pageserver): support split delta layers (#8599) part of https://github.com/neondatabase/neon/issues/8002 Similar to https://github.com/neondatabase/neon/pull/8574, we add auto-split support for delta layers. Tests are reused from image layer split writers. --------- Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/delta_layer.rs | 18 ++ .../src/tenant/storage_layer/split_writer.rs | 243 ++++++++++++++++-- 2 files changed, 242 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f9becf53ff..e50fc2a266 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -384,6 +384,9 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, blob_writer: BlobWriter, + + // Number of key-lsns in the layer. 
+ num_keys: usize, } impl DeltaLayerWriterInner { @@ -425,6 +428,7 @@ impl DeltaLayerWriterInner { lsn_range, tree: tree_builder, blob_writer, + num_keys: 0, }) } @@ -475,6 +479,9 @@ impl DeltaLayerWriterInner { let delta_key = DeltaKey::from_key_lsn(&key, lsn); let res = self.tree.append(&delta_key.0, blob_ref.0); + + self.num_keys += 1; + (val, res.map_err(|e| anyhow::anyhow!(e))) } @@ -686,6 +693,17 @@ impl DeltaLayerWriter { .finish(key_end, timeline, ctx) .await } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + + #[cfg(test)] + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } } impl Drop for DeltaLayerWriter { diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index a4091a890c..a966775f9e 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -1,12 +1,12 @@ -use std::sync::Arc; +use std::{ops::Range, sync::Arc}; use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; -use super::{ImageLayerWriter, ResidentLayer}; +use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; /// An image writer that takes images and produces multiple image layers. The interface does not /// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files @@ -98,6 +98,107 @@ impl SplitImageLayerWriter { generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); Ok(generated_layers) } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not +/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files +/// to be cleaned up). +#[must_use] +pub struct SplitDeltaLayerWriter { + inner: DeltaLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn_range: Range, +} + +impl SplitDeltaLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn_range: Range, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: DeltaLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + start_key, + lsn_range.clone(), + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn_range, + }) + } + + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. 
+ let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + self.generated_layers + .push(prev_delta_writer.finish(key, tline, ctx).await?); + } + self.inner.put_value(key, lsn, val, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + generated_layers.push(inner.finish(end_key, tline, ctx).await?); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } } #[cfg(test)] @@ -138,7 +239,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -150,11 +251,42 @@ mod tests { .await .unwrap(); - writer + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer .put_image(get_key(0), get_img(0), &tline, &ctx) .await .unwrap(); - let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); assert_eq!(layers.len(), 1); } @@ -170,7 +302,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -181,26 +313,58 @@ mod tests { ) .await .unwrap(); + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); const N: usize = 2000; for i in 0..N { let i = i as u32; - writer + image_writer .put_image(get_key(i), get_large_img(), &tline, &ctx) .await .unwrap(); + delta_writer + .put_value( + get_key(i), + Lsn(0x20), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); } - let layers = writer + let image_layers = image_writer .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - assert_eq!(layers.len(), N / 512 + 1); - for idx in 0..layers.len() { - assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX); + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + 
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); if idx > 0 { assert_eq!( - layers[idx - 1].layer_desc().key_range.end, - layers[idx].layer_desc().key_range.start + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start ); } } @@ -218,7 +382,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -230,15 +394,56 @@ mod tests { .await .unwrap(); - writer + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer .put_image(get_key(0), get_img(0), &tline, &ctx) .await .unwrap(); - writer + image_writer .put_image(get_key(1), get_large_img(), &tline, &ctx) .await .unwrap(); - let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + delta_writer + .put_value( + get_key(1), + Lsn(0x1A), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); assert_eq!(layers.len(), 2); } } From afdbe0a7d0d688e1c0a49bea93030ba0fa3c962f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 5 Aug 2024 14:24:54 +0300 Subject: [PATCH 332/412] Update Postgres versions to use smgrexists() instead of access() to check if Oid is used (#8597) ## Problem PR #7992 was merged without correspondent changes in Postgres submodules and this is why test_oid_overflow.py is failed now. ## Summary of changes Bump Postgres versions ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index dbd0e6428b..7bbe834c8c 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 +Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 035b73a9c5..9eba7dd382 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc +Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b39f316137..5377f5ed72 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be +Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index eeebd646f5..570dfc1550 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], - "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], - "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] + "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], + "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], + "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] } From 274c2c40b99fc04372fc9eea57e0ff29d44af3ec Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Aug 2024 12:25:23 +0100 Subject: [PATCH 333/412] CI(trigger-e2e-tests): wait for promote-images job from the last commit (#8592) ## Problem We don't trigger e2e tests for draft PRs, but we do trigger them once a PR is in the "Ready for review" state. Sometimes, a PR can be marked as "Ready for review" before we finish image building. In such cases, triggering e2e tests fails. ## Summary of changes - Make `trigger-e2e-tests` job poll status of `promote-images` job from the build-and-test workflow for the last commit. 
And trigger only if the status is `success` - Remove explicit image checking from the workflow - Add `concurrency` for `triggere-e2e-tests` workflow to make it possible to cancel jobs in progress (if PR moves from "Draft" to "Ready for review" several times in a row) --- .github/workflows/trigger-e2e-tests.yml | 42 ++++++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 77928a343e..0a615b3e37 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -10,11 +10,13 @@ defaults: run: shell: bash -euxo pipefail {0} +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: cancel-previous-e2e-tests: @@ -64,19 +66,35 @@ jobs: needs: [ tag ] runs-on: ubuntu-22.04 env: + EVENT_ACTION: ${{ github.event.action }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: check if ecr image are present - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + - name: Wait for `promote-images` job to finish + # It's important to have a timeout here, the script in the step can run infinitely + timeout-minutes: 60 run: | - for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do - OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) - if [ "$OUTPUT" == "" ]; then - echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT - exit 1 - fi + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then + exit 0 + fi + + # For PRs we use the run id as the tag + BUILD_AND_TEST_RUN_ID=${TAG} + while true; do + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + case "$conclusion" in + success) + break + ;; + failure | cancelled | skipped) + echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + exit 1 + ;; + *) + echo "The 'promote-images' hasn't succeed yet. Waiting..." + sleep 60 + ;; + esac done - name: Set e2e-platforms From 6564afb82256461bfc67077de91151bc94e5e5aa Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Aug 2024 19:47:59 +0100 Subject: [PATCH 334/412] CI(trigger-e2e-tests): fix deadlock with Build and Test workflow (#8606) ## Problem In some cases, a deadlock between `build-and-test` and `trigger-e2e-tests` workflows can happen: ``` Build and Test Canceling since a deadlock for concurrency group 'Build and Test-8600/merge-anysha' was detected between 'top level workflow' and 'trigger-e2e-tests' ``` I don't understand the reason completely, probably `${{ github.workflow }}` got evaluated to the same value and somehow caused the issue. 
We don't need to limit concurrency for `trigger-e2e-tests` workflow. See https://neondb.slack.com/archives/C059ZC138NR/p1722869486708179?thread_ts=1722869027.960029&cid=C059ZC138NR --- .github/workflows/trigger-e2e-tests.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 0a615b3e37..6fbe785c56 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -10,10 +10,6 @@ defaults: run: shell: bash -euxo pipefail {0} -concurrency: - group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} From e1339ac9159d5bddcd57a5eb7204bf11f25e2a18 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Aug 2024 23:21:33 +0300 Subject: [PATCH 335/412] fix: allow awaiting logical size for root timelines (#8604) Currently if `GET /v1/tenant/x/timeline/y?force-await-initial-logical-size=true` is requested for a root timeline created within the current pageserver session, the request handler panics hitting the debug assertion. These timelines will always have an accurate (at initdb import) calculated logical size. Fix is to never attempt prioritizing timeline size calculation if we already have an exact value. Split off from #8528. --- pageserver/src/tenant/timeline.rs | 6 ++++++ pageserver/src/tenant/timeline/logical_size.rs | 4 ++++ test_runner/regress/test_timeline_size.py | 3 +++ 3 files changed, 13 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index be72e15c19..8c80a54bdd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4727,6 +4727,12 @@ impl Timeline { return; } + if self.current_logical_size.current_size().is_exact() { + // root timelines are initialized with exact count, but never start the background + // calculation + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index b0d6c4a27a..f4a4eea54a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -122,6 +122,10 @@ impl CurrentLogicalSize { Self::Exact(_) => Accuracy::Exact, } } + + pub(crate) fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_)) + } } impl LogicalSize { diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 5e9a42f6b4..1f220eec9e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -936,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # just make sure this doesn't hit an assertion + client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) endpoint.safe_psql_many( From 3b4b9c1d0b7d38ddef88cbbcd4765dddd30e01b1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Aug 
2024 23:06:47 +0100 Subject: [PATCH 336/412] CI(benchmarking): set pub/sub projects for LR tests (#8483) ## Problem > Currently, long-running LR tests recreate endpoints every night. We'd like to have along-running buildup of history to exercise the pageserver in this case (instead of "unit-testing" the same behavior everynight). Closes #8317 ## Summary of changes - Update Postgres version for replication tests - Set `BENCHMARK_PROJECT_ID_PUB`/`BENCHMARK_PROJECT_ID_SUB` env vars to projects that were created for this purpose --------- Co-authored-by: Sasha Krassovsky --- .github/actionlint.yml | 2 ++ .github/workflows/benchmarking.yml | 9 ++++-- test_runner/fixtures/neon_api.py | 6 ++-- .../performance/test_logical_replication.py | 29 ++++++++++++++----- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..d27fa01efa 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -8,6 +8,8 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - BENCHMARK_PROJECT_ID_PUB + - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index f7ea534fb9..0f4dac841e 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -147,7 +147,7 @@ jobs: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -168,7 +168,7 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Run benchmark + - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -176,12 +176,15 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }} + BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }} - - name: Run benchmark + - name: Run Physical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 658ed119a1..0636cfad06 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -285,9 +285,9 @@ class NeonApiEndpoint: self.project_id = project_id eps = neon_api.get_endpoints(project_id)["endpoints"] self.endpoint_id = eps[0]["id"] - self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ - "uri" - ] + self.connstr = neon_api.get_connection_uri( + project_id, endpoint_id=self.endpoint_id, pooled=False + )["uri"] pw = self.connstr.split("@")[0].split(":")[-1] self.pgbench_env = { "PGHOST": eps[0]["host"], diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 53bb29a659..4b4ffc1fee 100644 --- a/test_runner/performance/test_logical_replication.py +++ 
b/test_runner/performance/test_logical_replication.py @@ -100,24 +100,32 @@ def test_subscriber_lag( pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + if benchmark_project_pub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + if benchmark_project_sub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: - pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 - if benchmark_project_sub.is_new: + if not pub_exists: + pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") - sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1") initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() sub_conn.close() @@ -195,10 +203,15 @@ def test_publisher_restart( pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") - if benchmark_project_sub.is_new: + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") From 83afea3edbc52447025c741cd11fbf95a3c7bdd8 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Tue, 6 Aug 2024 10:07:48 +0800 Subject: [PATCH 337/412] feat(pageserver): support dry-run for gc-compaction, add statistics (#8557) Add dry-run mode that does not produce any image layer + delta layer. I will use this code to do some experiments and see how much space we can reclaim for tenants on staging. Part of https://github.com/neondatabase/neon/issues/8002 * Add dry-run mode that runs the full compaction process without updating the layer map. (We never call finish on the writers and the files will be removed before exiting the function). * Add compaction statistics and print them at the end of compaction. 
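A short self-contained sketch of the dry-run gate described above, using placeholder types (`Writer`, `Stats`) rather than the pageserver's actual writers: statistics are collected either way, and only the final persist step is skipped, so a dry run reports what would have been written without adding anything to the layer map.

```rust
#[derive(Debug, Default)]
struct Stats {
    layers_produced: u64,
    bytes_produced: u64,
}

struct Writer {
    buffered: u64,
}

impl Writer {
    fn size(&self) -> u64 {
        self.buffered
    }
    fn finish(self) -> String {
        format!("layer({} bytes)", self.buffered)
    }
}

/// Record the would-be layer in `stat`, but only persist it when not a dry run.
fn maybe_finish(writer: Writer, dry_run: bool, stat: &mut Stats) -> Option<String> {
    stat.layers_produced += 1;
    stat.bytes_produced += writer.size();
    if dry_run {
        None // writer is dropped; nothing reaches the layer map
    } else {
        Some(writer.finish())
    }
}

fn main() {
    let mut stat = Stats::default();
    assert!(maybe_finish(Writer { buffered: 4096 }, true, &mut stat).is_none());
    assert!(maybe_finish(Writer { buffered: 4096 }, false, &mut stat).is_some());
    println!("{stat:?}"); // both calls are counted, dry run or not
}
```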
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 56 +++++-- .../src/tenant/storage_layer/image_layer.rs | 8 + pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/compaction.rs | 151 +++++++++++++++++- 4 files changed, 204 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 84c5095610..72d3aedd05 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6899,7 +6899,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for (idx, expected) in expected_result.iter().enumerate() { assert_eq!( @@ -6993,7 +6996,10 @@ mod tests { guard.cutoffs.time = Lsn(0x40); guard.cutoffs.space = Lsn(0x40); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); Ok(()) } @@ -7327,7 +7333,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for idx in 0..10 { assert_eq!( @@ -7353,7 +7362,10 @@ mod tests { guard.cutoffs.time = Lsn(0x40); guard.cutoffs.space = Lsn(0x40); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); Ok(()) } @@ -7898,11 +7910,28 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. 
+ verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // compact again - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // increase GC horizon and compact again @@ -7912,11 +7941,17 @@ mod tests { guard.cutoffs.time = Lsn(0x38); guard.cutoffs.space = Lsn(0x38); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; Ok(()) @@ -8097,7 +8132,10 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + branch_tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index aa308ba3c1..f4f48aaf16 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -753,6 +753,10 @@ struct ImageLayerWriterInner { } impl ImageLayerWriterInner { + fn size(&self) -> u64 { + self.tree.borrow_writer().size() + self.blob_writer.size() + } + /// /// Start building a new image layer. /// @@ -1044,6 +1048,10 @@ impl ImageLayerWriter { .finish(timeline, ctx, Some(end_key)) .await } + + pub(crate) fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() + } } impl Drop for ImageLayerWriter { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8c80a54bdd..5c268bf875 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -704,6 +704,7 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, EnhancedGcBottomMostCompaction, + DryRun, } impl std::fmt::Debug for Timeline { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 421f718ad6..1ff029a313 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,8 +19,10 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; +use crate::walrecord::NeonWalRecord; use utils::lsn::Lsn; @@ -73,6 +76,7 @@ impl KeyHistoryRetention { key: Key, delta_writer: &mut Vec<(Key, Lsn, Value)>, mut image_writer: Option<&mut ImageLayerWriter>, + stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; @@ -82,6 +86,7 @@ impl KeyHistoryRetention { let Value::Image(img) = &logs[0].1 else { unreachable!() }; + stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { 
image_writer.put_image(key, img.clone(), ctx).await?; } else { @@ -89,24 +94,111 @@ impl KeyHistoryRetention { } } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } first_batch = false; } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } Ok(()) } } +#[derive(Debug, Serialize, Default)] +struct CompactionStatisticsNumSize { + num: u64, + size: u64, +} + +#[derive(Debug, Serialize, Default)] +pub struct CompactionStatistics { + delta_layer_visited: CompactionStatisticsNumSize, + image_layer_visited: CompactionStatisticsNumSize, + delta_layer_produced: CompactionStatisticsNumSize, + image_layer_produced: CompactionStatisticsNumSize, + num_delta_layer_discarded: usize, + num_image_layer_discarded: usize, + num_unique_keys_visited: usize, + wal_keys_visited: CompactionStatisticsNumSize, + image_keys_visited: CompactionStatisticsNumSize, + wal_produced: CompactionStatisticsNumSize, + image_produced: CompactionStatisticsNumSize, +} + +impl CompactionStatistics { + fn estimated_size_of_value(val: &Value) -> usize { + match val { + Value::Image(img) => img.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + _ => std::mem::size_of::(), + } + } + fn estimated_size_of_key() -> usize { + KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer) + } + fn visit_delta_layer(&mut self, size: u64) { + self.delta_layer_visited.num += 1; + self.delta_layer_visited.size += size; + } + fn visit_image_layer(&mut self, size: u64) { + self.image_layer_visited.num += 1; + self.image_layer_visited.size += size; + } + fn on_unique_key_visited(&mut self) { + self.num_unique_keys_visited += 1; + } + fn visit_wal_key(&mut self, val: &Value) { + self.wal_keys_visited.num += 1; + self.wal_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn visit_image_key(&mut self, val: &Value) { + self.image_keys_visited.num += 1; + self.image_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_key(&mut self, val: &Value) { + match val { + Value::Image(img) => self.produce_image_key(img), + Value::WalRecord(_) => self.produce_wal_key(val), + } + } + fn produce_wal_key(&mut self, val: &Value) { + self.wal_produced.num += 1; + self.wal_produced.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_image_key(&mut self, val: &Bytes) { + self.image_produced.num += 1; + self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; + } + fn discard_delta_layer(&mut self) { + self.num_delta_layer_discarded += 1; + } + fn discard_image_layer(&mut self) { + self.num_image_layer_discarded += 1; + } + fn produce_delta_layer(&mut self, size: u64) { + self.delta_layer_produced.num += 1; + self.delta_layer_produced.size += size; + } + fn produce_image_layer(&mut self, size: u64) { + self.image_layer_produced.num += 1; + self.image_layer_produced.size += size; + } +} + impl Timeline { /// TODO: cancellation /// @@ -118,12 +210,18 @@ impl Timeline { ctx: &RequestContext, ) -> Result { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, ctx) + self.compact_with_gc(cancel, flags, ctx) .await 
.map_err(CompactionError::Other)?; return Ok(false); } + if flags.contains(CompactFlags::DryRun) { + return Err(CompactionError::Other(anyhow!( + "dry-run mode is not supported for legacy compaction for now" + ))); + } + // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the @@ -1641,6 +1739,7 @@ impl Timeline { pub(crate) async fn compact_with_gc( self: &Arc, cancel: &CancellationToken, + flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { use std::collections::BTreeSet; @@ -1664,12 +1763,16 @@ impl Timeline { ) .await?; - info!("running enhanced gc bottom-most compaction"); + let dry_run = flags.contains(CompactFlags::DryRun); + + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); }; + let mut stat = CompactionStatistics::default(); + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. @@ -1740,6 +1843,9 @@ impl Timeline { let key_range = desc.get_key_range(); delta_split_points.insert(key_range.start); delta_split_points.insert(key_range.end); + stat.visit_delta_layer(desc.file_size()); + } else { + stat.visit_image_layer(desc.file_size()); } } let mut delta_layers = Vec::new(); @@ -1775,6 +1881,8 @@ impl Timeline { tline: &Arc, lowest_retain_lsn: Lsn, ctx: &RequestContext, + stats: &mut CompactionStatistics, + dry_run: bool, last_batch: bool, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid @@ -1831,6 +1939,7 @@ impl Timeline { let layer_generation = guard.get_from_key(&delta_key).metadata().generation; drop(guard); if layer_generation == tline.generation { + stats.discard_delta_layer(); // TODO: depending on whether we design this compaction process to run along with // other compactions, there could be layer map modifications after we drop the // layer guard, and in case it creates duplicated layer key, we will still error @@ -1857,6 +1966,10 @@ impl Timeline { for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } + stats.produce_delta_layer(delta_layer_writer.size()); + if dry_run { + return Ok(None); + } let delta_layer = delta_layer_writer .finish(delta_key.key_range.end, tline, ctx) .await?; @@ -1951,6 +2064,13 @@ impl Timeline { let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); while let Some((key, lsn, val)) = merge_iter.next().await? 
{ + if cancel.is_cancelled() { + return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + } + match val { + Value::Image(_) => stat.visit_image_key(&val), + Value::WalRecord(_) => stat.visit_wal_key(&val), + } if last_key.is_none() || last_key.as_ref() == Some(&key) { if last_key.is_none() { last_key = Some(key); @@ -1958,6 +2078,7 @@ impl Timeline { accumulated_values.push((key, lsn, val)); } else { let last_key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); let retention = self .generate_key_retention( *last_key, @@ -1974,6 +2095,7 @@ impl Timeline { *last_key, &mut delta_values, image_layer_writer.as_mut(), + &mut stat, ctx, ) .await?; @@ -1986,6 +2108,8 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, false, ) .await?, @@ -1998,6 +2122,7 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body + stat.on_unique_key_visited(); let retention = self .generate_key_retention( last_key, @@ -2014,6 +2139,7 @@ impl Timeline { last_key, &mut delta_values, image_layer_writer.as_mut(), + &mut stat, ctx, ) .await?; @@ -2026,6 +2152,8 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, true, ) .await?, @@ -2033,12 +2161,28 @@ impl Timeline { assert!(delta_values.is_empty(), "unprocessed keys"); let image_layer = if discard_image_layer { + stat.discard_image_layer(); None } else if let Some(writer) = image_layer_writer { - Some(writer.finish(self, ctx).await?) + stat.produce_image_layer(writer.size()); + if !dry_run { + Some(writer.finish(self, ctx).await?) + } else { + None + } } else { None }; + + info!( + "gc-compaction statistics: {}", + serde_json::to_string(&stat)? + ); + + if dry_run { + return Ok(()); + } + info!( "produced {} delta layers and {} image layers", delta_layers.len(), @@ -2062,6 +2206,7 @@ impl Timeline { let mut layer_selection = layer_selection; layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); compact_to.extend(image_layer); + // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; From c89ee814e1cbd2d6111d30afd4f13a05fe501ca2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Aug 2024 10:52:01 +0300 Subject: [PATCH 338/412] fix: make Timeline::set_disk_consistent_lsn use fetch_max (#8311) now it is safe to use from multiple callers, as we have two callers. --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c268bf875..05bf4eac8b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4123,17 +4123,11 @@ impl Timeline { /// Return true if the value changed /// - /// This function must only be used from the layer flush task, and may not be called concurrently. + /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { - // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. 
- let old_value = self.disk_consistent_lsn.load(); - if new_value != old_value { - assert!(new_value >= old_value); - self.disk_consistent_lsn.store(new_value); - true - } else { - false - } + let old_value = self.disk_consistent_lsn.fetch_max(new_value); + assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + new_value != old_value } /// Update metadata file From b2bc5795be03fae0fa579d06430865ee83bbbfb1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Aug 2024 12:09:56 +0300 Subject: [PATCH 339/412] feat: persistent gc blocking (#8600) Currently, we do not have facilities to persistently block GC on a tenant for whatever reason. We could do a tenant configuration update, but that is risky for generation numbers and would also be transient. Introduce a `gc_block` facility in the tenant, which manages per timeline blocking reasons. Additionally, add HTTP endpoints for enabling/disabling manual gc blocking for a specific timeline. For debugging, individual tenant status now includes a similar string representation logged when GC is skipped. Cc: #6994 --- libs/pageserver_api/src/models.rs | 9 + pageserver/src/http/openapi_spec.yml | 39 ++++ pageserver/src/http/routes.rs | 76 +++++++ pageserver/src/tenant.rs | 30 +++ pageserver/src/tenant/gc_block.rs | 213 ++++++++++++++++++ .../src/tenant/remote_timeline_client.rs | 117 ++++++++++ .../tenant/remote_timeline_client/index.rs | 133 +++++++++++ pageserver/src/tenant/timeline.rs | 16 ++ pageserver/src/tenant/timeline/delete.rs | 2 + test_runner/fixtures/pageserver/http.py | 16 ++ .../regress/test_timeline_gc_blocking.py | 67 ++++++ 11 files changed, 718 insertions(+) create mode 100644 pageserver/src/tenant/gc_block.rs create mode 100644 test_runner/regress/test_timeline_gc_blocking.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 591c45d908..b541bba6a1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -637,6 +637,13 @@ pub struct TenantInfo { pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, pub generation: u32, + + /// Opaque explanation if gc is being blocked. + /// + /// Only looked up for the individual tenant detail, not the listing. This is purely for + /// debugging, not included in openapi. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub gc_blocking: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -1427,6 +1434,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -1449,6 +1457,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 59e646d0ca..4656f2c93a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -308,6 +308,45 @@ paths: application/json: schema: type: string + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently add a gc blocking at the tenant level because of this timeline + responses: + "200": + description: OK + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently remove a tenant level gc blocking for this timeline + responses: + "200": + description: OK + /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 117f2c5869..fdab780bfb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -935,6 +935,7 @@ async fn tenant_list_handler( generation: (*gen) .into() .expect("Tenants are always attached with a generation"), + gc_blocking: None, }) .collect::>(); @@ -986,6 +987,7 @@ async fn tenant_status( .generation() .into() .expect("Tenants are always attached with a generation"), + gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), @@ -1226,6 +1228,72 @@ async fn evict_timeline_layer_handler( } } +async fn timeline_gc_blocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, true).await +} + +async fn timeline_gc_unblocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, false).await +} + +/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. +/// +/// Both are technically unsafe because they might fire off index uploads, thus they are POST. 
+async fn block_or_unblock_gc( + request: Request, + block: bool, +) -> Result, ApiError> { + use crate::tenant::{ + remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let fut = async { + if block { + timeline.block_gc(&tenant).await.map(|_| ()) + } else { + timeline.unblock_gc(&tenant).await + } + }; + + let span = tracing::info_span!( + "block_or_unblock_gc", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %timeline_id, + block = block, + ); + + let res = fut.instrument(span).await; + + res.map_err(|e| { + if e.is::() || e.is::() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Get tenant_size SVG graph along with the JSON data. fn synthetic_size_html_response( inputs: ModelInputs, @@ -2904,6 +2972,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", + |r| api_handler(r, timeline_gc_blocking_handler), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", + |r| api_handler(r, timeline_gc_unblocking_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 72d3aedd05..de9b55d847 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -148,6 +148,7 @@ pub(crate) mod timeline; pub mod size; +mod gc_block; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -303,6 +304,12 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + /// `index_part.json` based gc blocking reason tracking. + /// + /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before + /// proceeding. + pub(crate) gc_block: gc_block::GcBlock, + l0_flush_global_state: L0FlushGlobalState, } @@ -1036,6 +1043,8 @@ impl Tenant { } } + let mut gc_blocks = HashMap::new(); + // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. @@ -1045,6 +1054,16 @@ impl Tenant { .remove(&timeline_id) .expect("just put it in above"); + if let Some(blocking) = index_part.gc_blocking.as_ref() { + // could just filter these away, but it helps while testing + anyhow::ensure!( + !blocking.reasons.is_empty(), + "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons" + ); + let prev = gc_blocks.insert(timeline_id, blocking.reasons); + assert!(prev.is_none()); + } + // TODO again handle early failure self.load_remote_timeline( timeline_id, @@ -1089,6 +1108,8 @@ impl Tenant { // IndexPart is the source of truth. 
self.clean_up_timelines(&existent_timelines)?; + self.gc_block.set_scanned(gc_blocks); + fail::fail_point!("attach-before-activate", |_| { anyhow::bail!("attach-before-activate"); }); @@ -1679,6 +1700,14 @@ impl Tenant { } } + let _guard = match self.gc_block.start().await { + Ok(guard) => guard, + Err(reasons) => { + info!("Skipping GC: {reasons}"); + return Ok(GcResult::default()); + } + }; + self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx) .await } @@ -2691,6 +2720,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + gc_block: Default::default(), l0_flush_global_state, } } diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs new file mode 100644 index 0000000000..8b41ba1746 --- /dev/null +++ b/pageserver/src/tenant/gc_block.rs @@ -0,0 +1,213 @@ +use std::collections::HashMap; + +use utils::id::TimelineId; + +use super::remote_timeline_client::index::GcBlockingReason; + +type Storage = HashMap>; + +#[derive(Default)] +pub(crate) struct GcBlock { + /// The timelines which have current reasons to block gc. + /// + /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done + /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. + reasons: std::sync::Mutex, + blocking: tokio::sync::Mutex<()>, +} + +impl GcBlock { + /// Start another gc iteration. + /// + /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with + /// it's ending, or if not currently possible, a value describing the reasons why not. + /// + /// Cancellation safe. + pub(super) async fn start(&self) -> Result, BlockingReasons> { + let reasons = { + let g = self.reasons.lock().unwrap(); + + // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in + // tests, we use everything. we should warn if the gc has been consecutively blocked + // for more than 1h (within single tenant session?). + BlockingReasons::clean_and_summarize(g) + }; + + if let Some(reasons) = reasons { + Err(reasons) + } else { + Ok(Guard { + _inner: self.blocking.lock().await, + }) + } + } + + pub(crate) fn summary(&self) -> Option { + let g = self.reasons.lock().unwrap(); + + BlockingReasons::summarize(&g) + } + + /// Start blocking gc for this one timeline for the given reason. + /// + /// This is not a guard based API but instead it mimics set API. The returned future will not + /// resolve until an existing gc round has completed. + /// + /// Returns true if this block was new, false if gc was already blocked for this reason. + /// + /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will + /// keep the gc blocking reason. + pub(crate) async fn insert( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result { + let (added, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + let set = g.entry(timeline.timeline_id).or_default(); + let added = set.insert(reason); + + // LOCK ORDER: intentionally hold the lock, see self.reasons. + let uploaded = timeline + .remote_client + .schedule_insert_gc_block_reason(reason)?; + + (added, uploaded) + }; + + uploaded.await?; + + // ensure that any ongoing gc iteration has completed + drop(self.blocking.lock().await); + + Ok(added) + } + + /// Remove blocking gc for this one timeline and the given reason. 
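+ ///
+ /// The `index_part.json` update is scheduled even if no matching in-memory block exists, so a
+ /// retry after an earlier cancellation still removes the persisted reason.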
+ pub(crate) async fn remove( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result<()> { + use std::collections::hash_map::Entry; + + super::span::debug_assert_current_span_has_tenant_and_timeline_id(); + + let (remaining_blocks, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + match g.entry(timeline.timeline_id) { + Entry::Occupied(mut oe) => { + let set = oe.get_mut(); + set.remove(reason); + if set.is_empty() { + oe.remove(); + } + } + Entry::Vacant(_) => { + // we must still do the index_part.json update regardless, in case we had earlier + // been cancelled + } + } + + let remaining_blocks = g.len(); + + // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons + let uploaded = timeline + .remote_client + .schedule_remove_gc_block_reason(reason)?; + + (remaining_blocks, uploaded) + }; + uploaded.await?; + + // no need to synchronize with gc iteration again + + if remaining_blocks > 0 { + tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked"); + } else { + tracing::info!("gc is now unblocked for the tenant"); + } + + Ok(()) + } + + pub(crate) fn before_delete(&self, timeline: &super::Timeline) { + let unblocked = { + let mut g = self.reasons.lock().unwrap(); + if g.is_empty() { + return; + } + + g.remove(&timeline.timeline_id); + + BlockingReasons::clean_and_summarize(g).is_none() + }; + + if unblocked { + tracing::info!("gc is now unblocked following deletion"); + } + } + + /// Initialize with the non-deleted timelines of this tenant. + pub(crate) fn set_scanned(&self, scanned: Storage) { + let mut g = self.reasons.lock().unwrap(); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + + if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { + tracing::info!(summary=?reasons, "initialized with gc blocked"); + } + } +} + +pub(super) struct Guard<'a> { + _inner: tokio::sync::MutexGuard<'a, ()>, +} + +#[derive(Debug)] +pub(crate) struct BlockingReasons { + timelines: usize, + reasons: enumset::EnumSet, +} + +impl std::fmt::Display for BlockingReasons { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} timelines block for {:?}", + self.timelines, self.reasons + ) + } +} + +impl BlockingReasons { + fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { + let mut reasons = enumset::EnumSet::empty(); + g.retain(|_key, value| { + reasons = reasons.union(*value); + !value.is_empty() + }); + if !g.is_empty() { + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } else { + None + } + } + + fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { + if g.is_empty() { + None + } else { + let reasons = g + .values() + .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } + } +} diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 9e021c7e35..1344fe4192 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -800,6 +800,123 @@ impl RemoteTimelineClient { .context("wait completion") } + /// Adds a gc blocking reason for this timeline if one does not exist already. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. 
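+ ///
+ /// Panics if `reason` is `DetachAncestor` while the dirty index metadata has no ancestor
+ /// timeline to detach from.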
+ pub(crate) fn schedule_insert_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if upload_queue.dirty.metadata.ancestor_timeline().is_none() { + drop(guard); + panic!("cannot start detach ancestor if there is nothing to detach from"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason)); + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + // Usual case: !wanted(x) && !wanted(y) + // + // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to + // turn on and off some reason. + (x, y) => { + if !wanted(x) && wanted(y) { + // this could be avoided by having external in-memory synchronization, like + // timeline detach ancestor + warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + } + + // at this point, the metadata must always show that there is a parent + upload_queue.dirty.gc_blocking = current + .map(|x| x.with_reason(reason)) + .or_else(|| Some(index::GcBlocking::started_now_for(reason))); + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + + /// Removes a gc blocking reason for this timeline if one exists. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_remove_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if !upload_queue + .clean + .0 + .lineage + .is_detached_from_original_ancestor() + { + drop(guard); + panic!("cannot complete timeline_ancestor_detach while not detached"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| { + x.is_none() || x.is_some_and(|b| !b.blocked_by(reason)) + }; + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + (x, y) => { + if !wanted(x) && wanted(y) { + warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + } + + upload_queue.dirty.gc_blocking = + current.as_ref().and_then(|x| x.without_reason(reason)); + assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); + // FIXME: bogus ? + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + /// Launch an upload operation in the background; the file is added to be included in next /// `index_part.json` upload. 
pub(crate) fn schedule_layer_file_upload( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3075df022e..8e6290030d 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -60,6 +60,9 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_blocking: Option, + /// Describes the kind of aux files stored in the timeline. /// /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. @@ -101,6 +104,7 @@ impl IndexPart { deleted_at: None, archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: None, } } @@ -251,6 +255,64 @@ impl Lineage { } } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct GcBlocking { + pub(crate) started_at: NaiveDateTime, + pub(crate) reasons: enumset::EnumSet, +} + +#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)] +#[enumset(serialize_repr = "list")] +pub(crate) enum GcBlockingReason { + Manual, + DetachAncestor, +} + +impl GcBlocking { + pub(super) fn started_now_for(reason: GcBlockingReason) -> Self { + GcBlocking { + started_at: chrono::Utc::now().naive_utc(), + reasons: enumset::EnumSet::only(reason), + } + } + + /// Returns true if the given reason is one of the reasons why the gc is blocked. + pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool { + self.reasons.contains(reason) + } + + /// Returns a version of self with the given reason. + pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self { + assert!(!self.blocked_by(reason)); + let mut reasons = self.reasons; + reasons.insert(reason); + + Self { + started_at: self.started_at, + reasons, + } + } + + /// Returns a version of self without the given reason. Assumption is that if + /// there are no more reasons, we can unblock the gc by returning `None`. 
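+ ///
+ /// Panics if `self` is not currently blocked by the given `reason`.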
+ pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { + assert!(self.blocked_by(reason)); + + if self.reasons.len() == 1 { + None + } else { + let mut reasons = self.reasons; + assert!(reasons.remove(reason)); + assert!(!reasons.is_empty()); + + Some(Self { + started_at: self.started_at, + reasons, + }) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -292,6 +354,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -335,6 +398,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -379,6 +443,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -426,6 +491,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -468,6 +534,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -513,6 +580,7 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: None, }; @@ -563,6 +631,7 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), }; @@ -618,6 +687,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -674,6 +744,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -681,6 +752,68 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v9_indexpart_is_parsed() { + let example = r#"{ + "version": 9, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + } + }"#; + + let expected = IndexPart { + version: 9, + layer_metadata: HashMap::from([ + 
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 05bf4eac8b..79bfd1ebb2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5698,6 +5698,22 @@ impl Timeline { } } + /// Persistently blocks gc for `Manual` reason. + /// + /// Returns true if no such block existed before, false otherwise. + pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.insert(self, GcBlockingReason::Manual).await + } + + /// Persistently unblocks gc for `Manual` reason. + pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.remove(self, GcBlockingReason::Manual).await + } + #[cfg(test)] pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { self.last_record_lsn.advance(new_lsn); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 05178c38b4..b03dbb092e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -230,6 +230,8 @@ impl DeleteTimelineFlow { // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
timeline.shutdown(super::ShutdownMode::Hard).await; + tenant.gc_block.before_delete(&timeline); + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-index-deleted-at" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 192324f086..61e2204b23 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -556,6 +556,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + + def timeline_unblock_gc( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py new file mode 100644 index 0000000000..24de894687 --- /dev/null +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -0,0 +1,67 @@ +import time + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.utils import wait_timeline_detail_404 + + +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start( + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + ) + ps = env.pageserver + http = ps.http_client() + + foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) + + gc_active_line = ".* gc_loop.*: [12] timelines need GC" + gc_skipped_line = ".* gc_loop.*: Skipping GC: .*" + init_gc_skipped = ".*: initialized with gc blocked.*" + + tenant_before = http.tenant_status(env.initial_tenant) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line) + + assert ps.log_contains(gc_skipped_line, offset) is None + + http.timeline_block_gc(env.initial_tenant, foo_branch) + + tenant_after = http.tenant_status(env.initial_tenant) + assert tenant_before != tenant_after + gc_blocking = tenant_after["gc_blocking"] + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + ps.restart() + ps.quiesce_tenants() + + _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # deletion unblocks gc + http.timeline_delete(env.initial_tenant, foo_branch) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + http.timeline_block_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # removing the manual block also unblocks gc + http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, 
offset) + + +def wait_for_another_gc_round(): + time.sleep(2) From 608c3cedbf53a3d076a8003005dab22cb913c529 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 6 Aug 2024 10:14:01 +0100 Subject: [PATCH 340/412] pageserver: remove legacy read path (#8601) ## Problem We have been maintaining two read paths (legacy and vectored) for a while now. The legacy read-path was only used for cross validation in some tests. ## Summary of changes * Tweak all tests that were using the legacy read path to use the vectored read path instead * Remove the read path dispatching based on the pageserver configs * Remove the legacy read path code We will be able to remove the single blob io code in `pageserver/src/tenant/blob_io.rs` when https://github.com/neondatabase/neon/issues/7386 is complete. Closes https://github.com/neondatabase/neon/issues/8005 --- pageserver/src/tenant.rs | 53 +- pageserver/src/tenant/storage_layer.rs | 15 - .../src/tenant/storage_layer/delta_layer.rs | 91 +-- .../src/tenant/storage_layer/image_layer.rs | 44 +- .../tenant/storage_layer/inmemory_layer.rs | 76 +-- pageserver/src/tenant/storage_layer/layer.rs | 73 +-- .../src/tenant/storage_layer/layer/tests.rs | 38 +- pageserver/src/tenant/timeline.rs | 581 ++---------------- 8 files changed, 121 insertions(+), 850 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index de9b55d847..989ed0d4eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4122,7 +4122,7 @@ pub(crate) mod harness { #[cfg(test)] mod tests { - use std::collections::BTreeMap; + use std::collections::{BTreeMap, BTreeSet}; use super::*; use crate::keyspace::KeySpaceAccum; @@ -4797,7 +4797,7 @@ mod tests { lsn: Lsn, repeat: usize, key_count: usize, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { let compact = true; bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } @@ -4810,7 +4810,9 @@ mod tests { repeat: usize, key_count: usize, compact: bool, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { + let mut inserted: HashMap> = Default::default(); + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4831,6 +4833,7 @@ mod tests { ctx, ) .await?; + inserted.entry(test_key).or_default().insert(lsn); writer.finish_write(lsn); drop(writer); @@ -4855,7 +4858,7 @@ mod tests { assert_eq!(res.layers_removed, 0, "this never removes anything"); } - Ok(()) + Ok(inserted) } // @@ -4902,7 +4905,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; + let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -4963,9 +4966,39 @@ mod tests { &ctx, ) .await; - tline - .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) - .await; + + let mut expected_lsns: HashMap = Default::default(); + let mut expect_missing = false; + let mut key = read.start().unwrap(); + while key != read.end().unwrap() { + if let Some(lsns) = inserted.get(&key) { + let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn); + match expected_lsn { + Some(lsn) => { + expected_lsns.insert(key, *lsn); + } + None => { + expect_missing = true; + break; + } + } + } else { + expect_missing = true; + break; + } + + key = key.next(); + } + + if expect_missing { + assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_)))); + } else { + for (key, image) in vectored_res? 
{ + let expected_lsn = expected_lsns.get(&key).expect("determined above"); + let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); + assert_eq!(image?, expected_image); + } + } } Ok(()) @@ -5015,10 +5048,6 @@ mod tests { ) .await; - child_timeline - .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) - .await; - let images = vectored_res?; assert!(images.is_empty()); Ok(()) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 59d3e1ce09..ab32a6035e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -435,21 +435,6 @@ impl ReadableLayer { } } -/// Return value from [`Layer::get_value_reconstruct_data`] -#[derive(Clone, Copy, Debug)] -pub enum ValueReconstructResult { - /// Got all the data needed to reconstruct the requested page - Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. - Continue, - - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing, -} - /// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e50fc2a266..a17dd28547 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::Layer; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -826,95 +826,6 @@ impl DeltaLayerInner { }) } - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let mut need_image = true; - // Scan the page versions backwards, starting from `lsn`. 
- let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - &block_reader, - ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - - tree_reader - .visit( - &search_key.0, - VisitDirection::Backwards, - |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); - - !blob_ref.will_init() - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await?; - - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor - .read_blob_into_buf(pos, &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. 
// diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f4f48aaf16..b2173455ab 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -32,9 +32,7 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{ - LayerAccessStats, ValueReconstructResult, ValueReconstructState, -}; +use crate::tenant::storage_layer::LayerAccessStats; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -429,46 +427,6 @@ impl ImageLayerInner { }) } - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader - .get( - &keybuf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await? - { - let blob = block_reader - .block_cursor() - .read_blob( - offset, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(), - ) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. 
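// (With the legacy `get_value_reconstruct_data` removed above, this vectored entry point is the
// only remaining read path for image layers.)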
pub(super) async fn get_values_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f9010ae8a6..6abc89c2ed 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -10,11 +10,10 @@ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{l0_flush, page_cache, walrecord}; -use anyhow::{anyhow, ensure, Result}; +use anyhow::{anyhow, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -33,10 +32,7 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{ - DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, - ValuesReconstructState, -}; +use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState}; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -55,9 +51,6 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. pub(crate) end_lsn: OnceLock, - /// Used for traversal path. Cached representation of the in-memory layer before frozen. - local_path_str: Arc, - /// Used for traversal path. Cached representation of the in-memory layer after frozen. frozen_local_path_str: OnceLock>, @@ -248,12 +241,6 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } - pub(crate) fn local_path_str(&self) -> &Arc { - self.frozen_local_path_str - .get() - .unwrap_or(&self.local_path_str) - } - /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -303,60 +290,6 @@ impl InMemoryLayer { Ok(()) } - /// Look up given value in the layer. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - - let inner = self.inner.read().await; - - let reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. 
- if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // @@ -458,11 +391,6 @@ impl InMemoryLayer { Ok(InMemoryLayer { file_id: key, - local_path_str: { - let mut buf = String::new(); - inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); - buf.into() - }, frozen_local_path_str: OnceLock::new(), conf, timeline_id, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5732779e44..cee2fe7342 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, - ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; use utils::generation::Generation; @@ -301,42 +300,6 @@ impl Layer { self.0.delete_on_drop(); } - /// Return data needed to reconstruct given page at LSN. - /// - /// It is up to the caller to collect more data from the previous layer and - /// perform WAL redo, if necessary. - /// - /// # Cancellation-Safety - /// - /// This method is cancellation-safe. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - use anyhow::ensure; - - let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0.access_stats.record_access(ctx); - - if self.layer_desc().is_delta { - ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); - ensure!(self.layer_desc().key_range.contains(&key)); - } else { - ensure!(self.layer_desc().key_range.contains(&key)); - ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn()); - ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn()); - } - - layer - .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) - .await - .with_context(|| format!("get_value_reconstruct_data for layer {self}")) - } - pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, @@ -441,10 +404,6 @@ impl Layer { &self.0.path } - pub(crate) fn debug_str(&self) -> &Arc { - &self.0.debug_str - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -519,7 +478,7 @@ impl Layer { /// /// However when we want something evicted, we cannot evict it right away as there might be current /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet -/// read with [`Layer::get_value_reconstruct_data`]. +/// read with [`Layer::get_values_reconstruct_data`]. /// /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search #[derive(Debug)] @@ -600,9 +559,6 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the layer, used for traversal id. - debug_str: Arc, - desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. 
@@ -836,9 +792,6 @@ impl LayerInner { LayerInner { conf, - debug_str: { - format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() - }, path: local_path, desc, timeline: Arc::downgrade(timeline), @@ -1759,28 +1712,6 @@ impl DownloadedLayer { .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } - async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => { - d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) - .await - } - Image(i) => { - i.get_value_reconstruct_data(key, reconstruct_data, ctx) - .await - } - } - } - async fn get_values_reconstruct_data( &self, keyspace: KeySpace, diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 423cde001c..6b0d5f09ff 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -50,13 +50,26 @@ async fn smoke_test() { // all layers created at pageserver are like `layer`, initialized with strong // Arc. + let controlfile_keyspace = KeySpace { + ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()], + }; + let img_before = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .await .unwrap(); - data.img + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img .take() .expect("tenant harness writes the control file") }; @@ -74,13 +87,24 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .instrument(download_span.clone()) .await .unwrap(); - data.img.take().unwrap() + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img + .take() + .expect("tenant harness writes the control file") }; assert_eq!(img_before, img_after); @@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() { fn layer_size() { assert_eq!(size_of::(), 8); assert_eq!(size_of::(), 104); - assert_eq!(size_of::(), 312); + assert_eq!(size_of::(), 296); // it also has the utf8 path } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 79bfd1ebb2..5a02fd4a4c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,8 +22,8 @@ use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, - NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, + KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -59,10 +59,7 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; -use std::{ - cmp::{max, min}, - ops::ControlFlow, -}; +use std::{cmp::min, ops::ControlFlow}; use std::{ collections::btree_map::Entry, ops::{Deref, Range}, @@ -87,8 +84,8 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, ValuesReconstructState, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, + ValuesReconstructState, }, }; use crate::{ @@ -543,7 +540,6 @@ pub struct MissingKeyError { cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, - traversal_path: Vec, backtrace: Option, } @@ -564,18 +560,6 @@ impl std::fmt::Display for MissingKeyError { write!(f, ", ancestor {}", ancestor_lsn)?; } - if !self.traversal_path.is_empty() { - writeln!(f)?; - } - - for (r, c, l) in &self.traversal_path { - writeln!( - f, - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, - )?; - } - if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -918,119 +902,44 @@ impl Timeline { self.timeline_get_throttle.throttle(ctx, 1).await; - match self.conf.get_impl { - GetImpl::Legacy => { - let reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: None, - }; + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; - self.get_impl(key, lsn, reconstruct_state, ctx).await - } - GetImpl::Vectored => { - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); - // Initialise the reconstruct state for the key with the cache - // entry returned above. 
- let mut reconstruct_state = ValuesReconstructState::new(); + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; - let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - let key_value = vectored_res?.pop_first(); - match key_value { - Some((got_key, value)) => { - if got_key != key { - error!( - "Expected {}, but singular vectored get returned {}", - key, got_key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get returned wrong key" - ))) - } else { - value - } - } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - traversal_path: Vec::new(), - backtrace: None, - })), + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value } } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + })), } } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - mut reconstruct_state: ValueReconstructState, - ctx: &RequestContext, - ) -> Result { - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); - - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(GetKind::Singular) - .start_timer(); - let path = self - .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) - .await?; - timer.stop_and_record(); - - let start = Instant::now(); - let res = self.reconstruct_value(key, lsn, reconstruct_state).await; - let elapsed = start.elapsed(); - crate::metrics::RECONSTRUCT_TIME - .for_get_kind(GetKind::Singular) - .observe(elapsed.as_secs_f64()); - - if cfg!(feature = "testing") - && res.is_err() - && !matches!(res, Err(PageReconstructError::Cancelled)) - { - // it can only be walredo issue - use std::fmt::Write; - - let mut msg = String::new(); - - path.into_iter().for_each(|(res, cont_lsn, layer)| { - writeln!( - msg, - "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer, - ) - .expect("string grows") - }); - - // this is to rule out or provide evidence that we could in some cases read a duplicate - // walrecord - tracing::info!("walredo failed, path:\n{msg}"); - } - - res - } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; @@ -1080,28 +989,14 @@ impl Timeline { .throttle(ctx, key_count as usize) .await; - let res = match self.conf.get_vectored_impl { - GetVectoredImpl::Sequential => { - self.get_vectored_sequential_impl(keyspace, lsn, ctx).await - } - GetVectoredImpl::Vectored => { - let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(), - ctx, - ) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - 
vectored_res - } - }; + let res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) + .await; if let Some((metric, start)) = start { let elapsed = start.elapsed(); @@ -1190,65 +1085,6 @@ impl Timeline { vectored_res } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn get_vectored_sequential_impl( - &self, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result>, GetVectoredError> { - let mut values = BTreeMap::new(); - - for range in keyspace.ranges { - let mut key = range.start; - while key != range.end { - let block = self - .get_impl(key, lsn, ValueReconstructState::default(), ctx) - .await; - - use PageReconstructError::*; - match block { - Err(Cancelled) => return Err(GetVectoredError::Cancelled), - Err(MissingKey(_)) - if NON_INHERITED_RANGE.contains(&key) - || NON_INHERITED_SPARSE_RANGE.contains(&key) => - { - // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. - // When we add more types of keys into the page server, we should revisit this part of code and throw errors - // accordingly. - key = key.next(); - } - Err(MissingKey(err)) => { - return Err(GetVectoredError::MissingKey(err)); - } - Err(Other(err)) - if err - .to_string() - .contains("downloading evicted layer file failed") => - { - return Err(GetVectoredError::Other(err)) - } - Err(Other(err)) - if err - .chain() - .any(|cause| cause.to_string().contains("layer loading failed")) => - { - // The intent here is to achieve error parity with the vectored read path. - // When vectored read fails to load a layer it fails the whole read, hence - // we mimic this behaviour here to keep the validation happy. - return Err(GetVectoredError::Other(err)); - } - _ => { - values.insert(key, block); - key = key.next(); - } - } - } - } - - Ok(values) - } - pub(super) async fn get_vectored_impl( &self, keyspace: KeySpace, @@ -1319,113 +1155,6 @@ impl Timeline { Ok(results) } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn validate_get_vectored_impl( - &self, - vectored_res: &Result>, GetVectoredError>, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) { - if keyspace.overlaps(&Key::metadata_key_range()) { - // skip validation for metadata key range - return; - } - - let sequential_res = self - .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) - .await; - - fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { - use GetVectoredError::*; - match (lhs, rhs) { - (Oversized(l), Oversized(r)) => l == r, - (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l.key == r.key, - (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, - (Other(_), Other(_)) => true, - _ => false, - } - } - - match (&sequential_res, vectored_res) { - (Err(GetVectoredError::Cancelled), _) => {}, - (_, Err(GetVectoredError::Cancelled)) => {}, - (Err(seq_err), Ok(_)) => { - panic!(concat!("Sequential get failed with {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - seq_err, keyspace, lsn) }, - (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later - // to time out while waiting for its ancestor's Lsn to become ready and for the - // former to succeed (it essentially has a doubled wait time). 
- }, - (Ok(_), Err(vec_err)) => { - panic!(concat!("Vectored get failed with {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - vec_err, keyspace, lsn) }, - (Err(seq_err), Err(vec_err)) => { - assert!(errors_match(seq_err, vec_err), - "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, - (Ok(seq_values), Ok(vec_values)) => { - seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { - assert_eq!(seq_key, vec_key); - match (seq_res, vec_res) { - (Ok(seq_blob), Ok(vec_blob)) => { - Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); - }, - (Err(err), Ok(_)) => { - panic!( - concat!("Sequential get failed with {} for key {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Ok(_), Err(err)) => { - panic!( - concat!("Vectored get failed with {} for key {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Err(_), Err(_)) => {} - } - }) - } - } - } - - fn validate_key_equivalence( - key: &Key, - keyspace: &KeySpace, - lsn: Lsn, - seq: &Bytes, - vec: &Bytes, - ) { - if *key == AUX_FILES_KEY { - // The value reconstruct of AUX_FILES_KEY from records is not deterministic - // since it uses a hash map under the hood. Hence, deserialise both results - // before comparing. - let seq_aux_dir_res = AuxFilesDirectory::des(seq); - let vec_aux_dir_res = AuxFilesDirectory::des(vec); - match (&seq_aux_dir_res, &vec_aux_dir_res) { - (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { - assert_eq!( - seq_aux_dir, vec_aux_dir, - "Mismatch for key {} - keyspace={:?} lsn={}", - key, keyspace, lsn - ); - } - (Err(_), Err(_)) => {} - _ => { - panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); - } - } - } else { - // All other keys should reconstruct deterministically, so we simply compare the blobs. - assert_eq!( - seq, vec, - "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" - ); - } - } - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -3215,228 +2944,7 @@ impl Timeline { } } -type TraversalId = Arc; - -trait TraversalLayerExt { - fn traversal_id(&self) -> TraversalId; -} - -impl TraversalLayerExt for Layer { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.debug_str()) - } -} - -impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) - } -} - impl Timeline { - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. - async fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) - }); - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. 
- let mut traversal_path = Vec::::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = None; - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - if self.cancel.is_cancelled() { - return Err(PageReconstructError::Cancelled); - } - - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(traversal_path), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - return Ok(traversal_path); - } - if let Some(prev) = prev_lsn { - if prev <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(cont_lsn.0 - 1), - request_lsn, - ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path, - backtrace: None, - })); - } - } - prev_lsn = Some(cont_lsn); - } - ValueReconstructResult::Missing => { - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path, - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - })); - } - } - - // Recurse into ancestor if needed - if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { - if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - - timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; - } - } - - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - - let open_layer = open_layer.clone(); - drop(guard); - - result = match open_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); - continue 'outer; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - - let frozen_layer = frozen_layer.clone(); - drop(guard); - - result = match frozen_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = guard.get_from_desc(&layer); - drop(guard); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer - .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, layer.traversal_id())); - continue 'outer; - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - continue 'outer; - } else { - // Nothing found - result = ValueReconstructResult::Missing; - continue 'outer; - } - } - } - #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -3530,7 +3038,6 @@ impl Timeline { cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path: vec![], backtrace: None, })); } @@ -5895,8 +5402,6 @@ impl Timeline { } } -type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); - /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. From 814b09025030ab814f655971bebf767ec80f4399 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Aug 2024 13:45:41 +0300 Subject: [PATCH 341/412] chore: bump index part version (#8611) #8600 missed the hunk changing index_part.json informative version. Include it in this PR, in addition add more non-warning index_part.json versions to scrubber. --- pageserver/src/tenant/remote_timeline_client/index.rs | 5 +++-- storage_scrubber/src/checks.rs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 8e6290030d..90453b1922 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -88,10 +88,11 @@ impl IndexPart { /// - 6: last_aux_file_policy is added. 
/// - 7: metadata_bytes is no longer written, but still read /// - 8: added `archived_at` - const LATEST_VERSION: usize = 8; + /// - 9: +gc_blocking + const LATEST_VERSION: usize = 9; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; pub const FILE_NAME: &'static str = "index_part.json"; diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 5aa9e88c40..14788515dd 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -92,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2); + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3); if !newest_versions.any(|ip| ip == &index_part.version()) { info!( "index_part.json version is not latest: {}", From 44d99757995806bf94e9a2a10e88ff51026512ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 6 Aug 2024 12:51:39 +0200 Subject: [PATCH 342/412] storage_scrubber: migrate scan_safekeeper_metadata to remote_storage (#8595) Migrates the safekeeper-specific parts of `ScanMetadata` to GenericRemoteStorage, making it Azure-ready. Part of https://github.com/neondatabase/neon/issues/7547 --- storage_scrubber/src/metadata_stream.rs | 32 ++++++++++++++++++- .../src/scan_safekeeper_metadata.rs | 20 ++++++------ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c702c0c312..54812ffc94 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; use futures::StreamExt; -use remote_storage::{GenericRemoteStorage, ListingMode}; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; use crate::{ @@ -276,3 +276,33 @@ pub(crate) fn stream_listing<'a>( } } } + +pub(crate) fn stream_listing_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a S3Target, +) -> impl Stream)>> + 'a { + let listing_mode = if target.delimiter.is_empty() { + ListingMode::NoDelimiter + } else { + ListingMode::WithDelimiter + }; + try_stream! 
{ + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + listing_mode, + target, + )); + while let Some(list) = objects_stream.next().await { + let list = list?; + if target.delimiter.is_empty() { + for key in list.keys { + yield (key.key.clone(), Some(key)); + } + } else { + for key in list.prefixes { + yield (key, None); + } + } + } + } +} diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 553adf8f46..08a4541c5c 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,10 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; -use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{error, info, trace}; @@ -14,8 +14,9 @@ use utils::{ }; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + cloud_admin_api::CloudAdminApiClient, init_remote_generic, + metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget, + TenantShardTimelineId, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. @@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; + let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); @@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata( let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( - &s3_client, + &remote_client, &target, &cloud_admin_api_client, ttid, @@ -156,7 +157,7 @@ struct TimelineCheckResult { /// errors are logged to stderr; returns Ok(true) if timeline is consistent, /// Ok(false) if not, Err if failed to check. async fn check_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, root: &RootTarget, api_client: &CloudAdminApiClient, ttid: TenantTimelineId, @@ -187,12 +188,13 @@ async fn check_timeline( // we need files, so unset it. timeline_dir_target.delimiter = String::new(); - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let (key, _obj) = obj?; let seg_name = key + .get_path() + .as_str() .strip_prefix(&timeline_dir_target.prefix_in_bucket) .expect("failed to extract segment name"); expected_segfiles.remove(seg_name); From bd276839add2c9a417263854a98a4aee3c8e24e4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Aug 2024 11:52:48 +0300 Subject: [PATCH 343/412] Add package-mode=false to poetry. We don't use it for packaging, and 'poetry install' will soon error otherwise. Also remove name and version fields as these are not required for non-packaging mode. 
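
For illustration only (not part of the applied diff): a minimal sketch of what the resulting `[tool.poetry]` table looks like for a repository that uses Poetry purely for dependency management. The `package-mode` flag is assumed to require Poetry 1.8 or newer, and the single dependency shown merely stands in for the real list in `pyproject.toml`.

```toml
# Hypothetical excerpt of pyproject.toml after this change: no name/version,
# since nothing is built or published from this project.
[tool.poetry]
description = ""
authors = []
package-mode = false   # dependency management only

[tool.poetry.dependencies]
python = "^3.9"
```

With `package-mode = false`, `poetry install` no longer requires the `name` and `version` fields, which is why they can be dropped here.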
--- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d5782ac7c..36a1e24ca1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] -name = "neon" -version = "0.1.0" description = "" authors = [] +package-mode = false [tool.poetry.dependencies] python = "^3.9" From 13e794a35cf9e03794b3b904e4b6804645a8ee57 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:08:55 +0200 Subject: [PATCH 344/412] Add a test using Debezium as a client for the logical replication (#8568) ## Problem We need to test the logical replication with some external consumers. ## Summary of changes A test of the logical replication with Debezium as a consumer was added. --------- Co-authored-by: Alexander Bayandin --- .github/workflows/pg-clients.yml | 26 ++- poetry.lock | 16 +- pyproject.toml | 2 + test_runner/fixtures/utils.py | 7 +- test_runner/logical_repl/README.md | 22 ++ .../clickhouse/docker-compose.yml | 9 + .../logical_repl/debezium/docker-compose.yml | 24 +++ .../{test_log_repl.py => test_clickhouse.py} | 16 +- test_runner/logical_repl/test_debezium.py | 189 ++++++++++++++++++ 9 files changed, 297 insertions(+), 14 deletions(-) create mode 100644 test_runner/logical_repl/README.md create mode 100644 test_runner/logical_repl/clickhouse/docker-compose.yml create mode 100644 test_runner/logical_repl/debezium/docker-compose.yml rename test_runner/logical_repl/{test_log_repl.py => test_clickhouse.py} (85%) create mode 100644 test_runner/logical_repl/test_debezium.py diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 55b68ccdb5..23a2e3876c 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -66,7 +66,31 @@ jobs: ports: - 9000:9000 - 8123:8123 - + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + ports: + - 2181:2181 + kafka: + image: quay.io/debezium/kafka:2.7 + env: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + env: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 8083:8083 steps: - uses: actions/checkout@v4 diff --git a/poetry.lock b/poetry.lock index d7a3dde65b..9026824558 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1514,6 +1514,20 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "kafka-python" +version = "2.0.2" +description = "Pure Python client for Apache Kafka" +optional = false +python-versions = "*" +files = [ + {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, + {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, +] + +[package.extras] +crc32c = ["crc32c"] + [[package]] name = "lazy-object-proxy" version = "1.10.0" @@ -3357,4 +3371,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00" +content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72" diff --git 
a/pyproject.toml b/pyproject.toml index 36a1e24ca1..cfb569b2ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" +kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -74,6 +75,7 @@ module = [ "allure.*", "allure_commons.*", "allure_pytest.*", + "kafka.*", ] ignore_missing_imports = true diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7f54eb0b0a..4dc9f7caae 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -389,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet") def wait_until( - number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] + number_of_iterations: int, + interval: float, + func: Callable[[], WaitUntilRet], + show_intermediate_error=False, ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the @@ -402,6 +405,8 @@ def wait_until( except Exception as e: log.info("waiting for %s iteration %s failed", func, i + 1) last_exception = e + if show_intermediate_error: + log.info(e) time.sleep(interval) continue return res diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md new file mode 100644 index 0000000000..8eca056dda --- /dev/null +++ b/test_runner/logical_repl/README.md @@ -0,0 +1,22 @@ +# Logical replication tests + +## Clickhouse + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f clickhouse/docker-compose.yml up -d +pytest -m remote_cluster -k test_clickhouse +docker compose -f clickhouse/docker-compose.yml down +``` + +## Debezium + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f debezium/docker-compose.yml up -d +pytest -m remote_cluster -k test_debezium +docker compose -f debezium/docker-compose.yml down + +``` \ No newline at end of file diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml new file mode 100644 index 0000000000..e00038b811 --- /dev/null +++ b/test_runner/logical_repl/clickhouse/docker-compose.yml @@ -0,0 +1,9 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server + user: "101:101" + container_name: clickhouse + hostname: clickhouse + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml new file mode 100644 index 0000000000..fee127a2fd --- /dev/null +++ b/test_runner/logical_repl/debezium/docker-compose.yml @@ -0,0 +1,24 @@ +services: + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + kafka: + image: quay.io/debezium/kafka:2.7 + environment: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 127.0.0.1:9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 127.0.0.1:8083:8083 diff --git a/test_runner/logical_repl/test_log_repl.py 
b/test_runner/logical_repl/test_clickhouse.py similarity index 85% rename from test_runner/logical_repl/test_log_repl.py rename to test_runner/logical_repl/test_clickhouse.py index 0a1aecfe2b..c5ed9bc8af 100644 --- a/test_runner/logical_repl/test_log_repl.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -1,8 +1,9 @@ """ -Test the logical replication in Neon with the different consumers +Test the logical replication in Neon with ClickHouse as a consumer """ import hashlib +import os import time import clickhouse_connect @@ -39,22 +40,15 @@ def test_clickhouse(remote_pg: RemotePostgres): """ Test the logical replication having ClickHouse as a client """ + clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1" conn_options = remote_pg.conn_options() - for _ in range(5): - try: - conn = psycopg2.connect(remote_pg.connstr()) - except psycopg2.OperationalError as perr: - log.debug(perr) - time.sleep(1) - else: - break - raise TimeoutError + conn = psycopg2.connect(remote_pg.connstr()) cur = conn.cursor() cur.execute("DROP TABLE IF EXISTS table1") cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") conn.commit() - client = clickhouse_connect.get_client(host="clickhouse") + client = clickhouse_connect.get_client(host=clickhouse_host) client.command("SET allow_experimental_database_materialized_postgresql=1") client.command( "CREATE DATABASE db1_postgres ENGINE = " diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py new file mode 100644 index 0000000000..700b731418 --- /dev/null +++ b/test_runner/logical_repl/test_debezium.py @@ -0,0 +1,189 @@ +""" +Test the logical replication in Neon with Debezium as a consumer +""" + +import json +import os +import time + +import psycopg2 +import pytest +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until +from kafka import KafkaConsumer + + +class DebeziumAPI: + """ + The class for Debezium API calls + """ + + def __init__(self): + self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1" + self.__base_url = f"http://{self.__host}:8083" + self.__connectors_url = f"{self.__base_url}/connectors" + + def __request(self, method, addurl="", **kwargs): + return requests.request( + method, + self.__connectors_url + addurl, + headers={"Accept": "application/json", "Content-type": "application/json"}, + timeout=60, + **kwargs, + ) + + def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str): + """ + Create a Postgres connector in debezium + """ + conn_options = remote_pg.conn_options() + payload = { + "name": dbz_conn_name, + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": conn_options["host"], + "database.port": "5432", + "database.user": conn_options["user"], + "database.password": conn_options["password"], + "database.dbname": conn_options["dbname"], + "plugin.name": "pgoutput", + "topic.prefix": "dbserver1", + "schema.include.list": "inventory", + }, + } + return self.__request("POST", json=payload) + + def list_connectors(self): + """ + Returns a list of all connectors existent in Debezium. 
+ """ + resp = self.__request("GET") + assert resp.ok + return json.loads(resp.text) + + def del_connector(self, connector): + """ + Deletes the specified connector + """ + return self.__request("DELETE", f"/{connector}") + + +@pytest.fixture(scope="function") +def debezium(remote_pg: RemotePostgres): + """ + Prepare the Debezium API handler, connection + """ + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE") + cur.execute("CREATE SCHEMA inventory") + cur.execute( + "CREATE TABLE inventory.customers (" + "id SERIAL NOT NULL PRIMARY KEY," + "first_name character varying(255) NOT NULL," + "last_name character varying(255) NOT NULL," + "email character varying(255) NOT NULL)" + ) + conn.commit() + dbz = DebeziumAPI() + assert len(dbz.list_connectors()) == 0 + dbz_conn_name = "inventory-connector" + resp = dbz.create_pg_connector(remote_pg, dbz_conn_name) + log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) + assert resp.status_code == 201 + assert len(dbz.list_connectors()) == 1 + consumer = KafkaConsumer( + "dbserver1.inventory.customers", + bootstrap_servers=["kafka:9092"], + auto_offset_reset="earliest", + enable_auto_commit=False, + ) + yield conn, consumer + resp = dbz.del_connector(dbz_conn_name) + assert resp.status_code == 204 + + +def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None: + """ + Gets the message from Kafka and checks its validity + Arguments: + consumer: the consumer object + ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have + the later timestamp + before: a dictionary, if not None, the before field from the kafka message must + have the same values for the same keys + after: a dictionary, if not None, the after field from the kafka message must + have the same values for the same keys + """ + msg = consumer.poll() + assert msg, "Empty message" + for val in msg.values(): + r = json.loads(val[-1].value) + log.info(r["payload"]) + assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp" + for param, pname in ((before, "before"), (after, "after")): + if param is not None: + for k, v in param.items(): + assert r["payload"][pname][k] == v, f"{pname} mismatches" + + +@pytest.mark.remote_cluster +def test_debezium(debezium): + """ + Test the logical replication having Debezium as a subscriber + """ + conn, consumer = debezium + cur = conn.cursor() + ts_ms = time.time() * 1000 + log.info("Insert 1 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('John', 'Dow','johndow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Insert 2 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('Alex', 'Row','alexrow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Update ts_ms: %s", ts_ms) + cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": 
"Alexander"}, + ), + show_intermediate_error=True, + ) + time.sleep(3) + cur.execute("select 1") From 0be952fb8904c9ad41d350cab8aa846ba1fde411 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:56:42 +0200 Subject: [PATCH 345/412] enable rum test (#8380) ## Problem We need to test the rum extension automatically as a path of the GitHub workflow ## Summary of changes rum test is enabled --- Dockerfile.compute-node | 6 ++++-- docker-compose/docker_compose_test.sh | 2 +- docker-compose/run-tests.sh | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5e53a55316..054d44e0ec 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -#COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY patches/rum.patch /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src @@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src @@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch # cmake is required for the h3 test RUN apt-get update && apt-get install -y cmake RUN patch -p1 < /ext-src/pg_hintplan.patch diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a00591afd0..10805a9952 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -78,7 +78,7 @@ for pg_version in 14 15 16; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then cleanup diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index c05fc159aa..58b2581197 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,15 +1,15 @@ #!/bin/bash set -x -cd /ext-src +cd /ext-src || exit 2 FAILED= -LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do - [ -d ${d} ] || continue + [ -d "${d}" ] || continue psql -c "select 1" >/dev/null || break - make -C ${d} 
installcheck || FAILED="${d} ${FAILED}" + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 -echo ${FAILED} +echo "${FAILED}" exit 1 \ No newline at end of file From 8d8e428d4ccd8b6e427c0b9e321c05abcc8bc296 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 12:58:33 +0100 Subject: [PATCH 346/412] tests: improve stability of `test_storage_controller_many_tenants` (#8607) ## Problem The controller scale test does random migrations. These mutate secondary locations, and therefore can cause secondary optimizations to happen in the background, violating the test's expectation that consistency_check will work as there are no reconciliations running. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10247161379/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/6316beacd3fb3060/ ## Summary of changes - Only migrate to existing secondary locations, not randomly picked nodes, so that we can do a fast reconcile_until_idle (otherwise reconcile_until_idle is takes a long time to create new secondary locations). - Do a reconcile_until_idle before consistency_check. --- .../performance/test_storage_controller_scale.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 281c9271e9..04785f7184 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -217,7 +217,11 @@ def test_storage_controller_many_tenants( # A reconciler operation: migrate a shard. shard_number = rng.randint(0, shard_count - 1) tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) - dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + + # Migrate it to its secondary location + desc = env.storage_controller.tenant_describe(tenant_id) + dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] + f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) @@ -231,7 +235,11 @@ def test_storage_controller_many_tenants( for f in futs: f.result() - # Consistency check is safe here: all the previous operations waited for reconcile before completing + # Some of the operations above (notably migrations) might leave the controller in a state where it has + # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system + # to reach a quiescent state before doing following checks. + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() check_memory() From ea7be4152aeda69bd682359e7777df4662d59d0a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 14:47:01 +0100 Subject: [PATCH 347/412] pageserver: fixes for layer visibility metric (#8603) ## Problem In staging, we could see that occasionally tenants were wrapping their pageserver_visible_physical_size metric past zero to 2^64. This is harmless right now, but will matter more later when we start using visible size in things like the /utilization endpoint. ## Summary of changes - Add debug asserts that detect this case. `test_gc_of_remote_layers` works as a reproducer for this issue once the asserts are added. - Tighten up the interface around access_stats so that only Layer can mutate it. 
- In Layer, wrap calls to `record_access` in code that will update the visible size statistic if the access implicitly marks the layer visible (this was what caused the bug) - In LayerManager::rewrite_layers, use the proper set_visibility layer function instead of directly using access_stats (this is an additional path where metrics could go bad.) - Removed unused instances of LayerAccessStats in DeltaLayer and ImageLayer which I noticed while reviewing the code paths that call record_access. --- pageserver/src/tenant/storage_layer.rs | 14 +++++--- .../src/tenant/storage_layer/delta_layer.rs | 8 +---- .../src/tenant/storage_layer/image_layer.rs | 4 --- pageserver/src/tenant/storage_layer/layer.rs | 35 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 4 +-- .../src/tenant/timeline/eviction_task.rs | 2 +- .../src/tenant/timeline/layer_manager.rs | 7 ++-- 7 files changed, 46 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index ab32a6035e..04f89db401 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -539,19 +539,25 @@ impl LayerAccessStats { self.record_residence_event_at(SystemTime::now()) } - pub(crate) fn record_access_at(&self, now: SystemTime) { + fn record_access_at(&self, now: SystemTime) -> bool { let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); // A layer which is accessed must be visible. mask |= 0x1 << Self::VISIBILITY_SHIFT; value |= 0x1 << Self::VISIBILITY_SHIFT; - self.write_bits(mask, value); + let old_bits = self.write_bits(mask, value); + !matches!( + self.decode_visibility(old_bits), + LayerVisibilityHint::Visible + ) } - pub(crate) fn record_access(&self, ctx: &RequestContext) { + /// Returns true if we modified the layer's visibility to set it to Visible implicitly + /// as a result of this access + pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { - return; + return false; } self.record_access_at(SystemTime::now()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a17dd28547..962faa6796 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -72,10 +72,7 @@ use utils::{ lsn::Lsn, }; -use super::{ - AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -200,7 +197,6 @@ impl DeltaKey { pub struct DeltaLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, - access_stats: LayerAccessStats, inner: OnceCell>, } @@ -299,7 +295,6 @@ impl DeltaLayer { /// not loaded already. 
/// async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { - self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -350,7 +345,6 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: Default::default(), inner: OnceCell::new(), }) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b2173455ab..16ba0fda94 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -32,7 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::LayerAccessStats; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -135,7 +134,6 @@ pub struct ImageLayer { pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, - access_stats: LayerAccessStats, inner: OnceCell, } @@ -253,7 +251,6 @@ impl ImageLayer { /// not loaded already. /// async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { - self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -304,7 +301,6 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - access_stats: Default::default(), inner: OnceCell::new(), }) } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index cee2fe7342..83450d24bb 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -316,7 +316,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0.access_stats.record_access(ctx); + self.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -396,8 +396,12 @@ impl Layer { self.0.info(reset) } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - &self.0.access_stats + pub(crate) fn latest_activity(&self) -> SystemTime { + self.0.access_stats.latest_activity() + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + self.0.access_stats.visibility() } pub(crate) fn local_path(&self) -> &Utf8Path { @@ -447,13 +451,31 @@ impl Layer { } } + fn record_access(&self, ctx: &RequestContext) { + if self.0.access_stats.record_access(ctx) { + // Visibility was modified to Visible + tracing::info!( + "Layer {} became visible as a result of access", + self.0.desc.key() + ); + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + } + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - let old_visibility = self.access_stats().set_visibility(visibility.clone()); + let old_visibility = self.0.access_stats.set_visibility(visibility.clone()); use LayerVisibilityHint::*; match (old_visibility, visibility) { (Visible, Covered) => { // Subtract this layer's contribution to the visible size metric if let Some(tl) = self.0.timeline.upgrade() { + debug_assert!( + tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size + ); tl.metrics 
.visible_physical_size_gauge .sub(self.0.desc.file_size) @@ -671,6 +693,9 @@ impl Drop for LayerInner { } if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + debug_assert!( + timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size + ); timeline .metrics .visible_physical_size_gauge @@ -1810,7 +1835,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner.access_stats.record_access(ctx); + self.owner.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5a02fd4a4c..6c67fb9cb6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2920,7 +2920,7 @@ impl Timeline { let guard = self.layers.read().await; let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); HeatMapLayer::new( layer.layer_desc().layer_name(), @@ -5182,7 +5182,7 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); EvictionCandidate { layer: layer.into(), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fec66aabc1..1ba1bf9de5 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -225,7 +225,7 @@ impl Timeline { continue; } - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 1bc2acbd34..e6e7bc2e77 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -259,13 +259,10 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); - // Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents // always marking rewritten layers as visible. - new_layer - .as_ref() - .access_stats() - .set_visibility(old_layer.access_stats().visibility()); + new_layer.as_ref().set_visibility(old_layer.visibility()); // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, From 6519f875b99c50ddc08ece91eb553fc92c1bb8bf Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 17:15:40 +0100 Subject: [PATCH 348/412] pageserver: use layer visibility when composing heatmap (#8616) ## Problem Sometimes, a layer is Covered by hasn't yet been evicted from local disk (e.g. shortly after image layer generation). It is not good use of resources to download these to a secondary location, as there's a good chance they will never be read. 
This follows the previous change that added layer visibility: - #8511 Part of epic: - https://github.com/neondatabase/neon/issues/8398 ## Summary of changes - When generating heatmaps, only include Visible layers - Update test_secondary_downloads to filter to visible layers when listing layers from an attached location --- pageserver/src/tenant/timeline.rs | 26 ++++++---- test_runner/fixtures/pageserver/http.py | 2 + .../regress/test_pageserver_secondary.py | 52 +++++++++++++++---- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6c67fb9cb6..4ff87f20f1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -137,7 +137,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, upload_queue::NotInitialized}; +use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ @@ -2919,14 +2919,22 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.latest_activity(); - - HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - ) + let resident = guard.likely_resident_layers().filter_map(|layer| { + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Layer is visible to one or more read LSNs: elegible for inclusion in layer map + let last_activity_ts = layer.latest_activity(); + Some(HeatMapLayer::new( + layer.layer_desc().layer_name(), + layer.metadata(), + last_activity_ts, + )) + } + LayerVisibilityHint::Covered => { + // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. 
+ None + } + } }); let layers = resident.collect(); diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 61e2204b23..5be59d3749 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -61,6 +61,7 @@ class HistoricLayerInfo: remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer l0: Optional[bool] + visible: bool @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: @@ -79,6 +80,7 @@ class HistoricLayerInfo: lsn_end=d.get("lsn_end"), remote=d["remote"], l0=l0_ness, + visible=d["access_stats"]["visible"], ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 53f69b5b26..4b0af24480 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,10 +2,11 @@ import json import os import random import time -from typing import Any, Dict, Optional +from pathlib import Path +from typing import Any, Dict, Optional, Union import pytest -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name @@ -437,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) +def list_elegible_layers( + pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId +) -> list[Path]: + """ + The subset of layer filenames that are elegible for secondary download: at time of writing this + is all resident layers which are also visible. + """ + candidates = pageserver.list_layers(tenant_id, timeline_id) + + layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id) + + # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other + visible_map = dict( + (f"{layer.layer_file_name}-v1-00000001", layer.visible) + for layer in layer_map.historic_layers + ) + + def is_visible(layer_file_name): + try: + return visible_map[str(layer_file_name)] + except KeyError: + # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map + # matches what's on disk. 
+ log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + raise + + return list(c for c in candidates if is_visible(c)) + + def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -491,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers( tenant_id, timeline_id ) @@ -509,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # Do a full listing of the secondary location on errors, to help debug of # https://github.com/neondatabase/neon/issues/6966 @@ -532,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] - some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] + layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -551,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) From ba4e5b51a0eba308630312cfa8fa1ac306e19290 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 17:39:40 +0100 Subject: [PATCH 349/412] pageserver: add `bench_ingest` (#7409) ## Problem We lack a rust bench for the inmemory layer and delta layer write paths: it is useful to benchmark these components independent of postgres & WAL decoding. Related: https://github.com/neondatabase/neon/issues/8452 ## Summary of changes - Refactor DeltaLayerWriter to avoid carrying a Timeline, so that it can be cleanly tested + benched without a Tenant/Timeline test harness. It only needed the Timeline for building `Layer`, so this can be done in a separate step. - Add `bench_ingest`, which exercises a variety of workload "shapes" (big values, small values, sequential keys, random keys) - Include a small uncontroversial optimization: in `freeze`, only exhaustively walk values to assert ordering relative to end_lsn in debug mode. These benches are limited by drive performance on a lot of machines, but still useful as a local tool for iterating on CPU/memory improvements around this code path. 
Anecdotal measurements on Hetzner AX102 (Ryzen 7950xd): ``` ingest-small-values/ingest 128MB/100b seq time: [1.1160 s 1.1230 s 1.1289 s] thrpt: [113.38 MiB/s 113.98 MiB/s 114.70 MiB/s] Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild Benchmarking ingest-small-values/ingest 128MB/100b rand: Warming up for 3.0000 s Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 18.9s. ingest-small-values/ingest 128MB/100b rand time: [1.9001 s 1.9056 s 1.9110 s] thrpt: [66.982 MiB/s 67.171 MiB/s 67.365 MiB/s] Benchmarking ingest-small-values/ingest 128MB/100b rand-1024keys: Warming up for 3.0000 s Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 11.0s. ingest-small-values/ingest 128MB/100b rand-1024keys time: [1.0715 s 1.0828 s 1.0937 s] thrpt: [117.04 MiB/s 118.21 MiB/s 119.46 MiB/s] ingest-small-values/ingest 128MB/100b seq, no delta time: [425.49 ms 429.07 ms 432.04 ms] thrpt: [296.27 MiB/s 298.32 MiB/s 300.83 MiB/s] Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild ingest-big-values/ingest 128MB/8k seq time: [373.03 ms 375.84 ms 379.17 ms] thrpt: [337.58 MiB/s 340.57 MiB/s 343.13 MiB/s] Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high mild ingest-big-values/ingest 128MB/8k seq, no delta time: [81.534 ms 82.811 ms 83.364 ms] thrpt: [1.4994 GiB/s 1.5095 GiB/s 1.5331 GiB/s] Found 1 outliers among 10 measurements (10.00%) ``` --- pageserver/Cargo.toml | 4 + pageserver/benches/bench_ingest.rs | 235 ++++++++++++++++++ pageserver/src/l0_flush.rs | 4 +- .../src/tenant/storage_layer/delta_layer.rs | 45 ++-- .../tenant/storage_layer/inmemory_layer.rs | 37 +-- .../src/tenant/storage_layer/split_writer.rs | 11 +- pageserver/src/tenant/timeline.rs | 11 +- pageserver/src/tenant/timeline/compaction.rs | 44 ++-- .../src/tenant/timeline/detach_ancestor.rs | 6 +- 9 files changed, 322 insertions(+), 75 deletions(-) create mode 100644 pageserver/benches/bench_ingest.rs diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 43976250a4..0e748ee3db 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -108,3 +108,7 @@ harness = false [[bench]] name = "bench_walredo" harness = false + +[[bench]] +name = "bench_ingest" +harness = false diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs new file mode 100644 index 0000000000..af2b6934c6 --- /dev/null +++ b/pageserver/benches/bench_ingest.rs @@ -0,0 +1,235 @@ +use std::{env, num::NonZeroUsize}; + +use bytes::Bytes; +use camino::Utf8PathBuf; +use criterion::{criterion_group, criterion_main, Criterion}; +use pageserver::{ + config::PageServerConf, + context::{DownloadBehavior, RequestContext}, + l0_flush::{L0FlushConfig, L0FlushGlobalState}, + page_cache, + repository::Value, + task_mgr::TaskKind, + tenant::storage_layer::InMemoryLayer, + virtual_file::{self, api::IoEngineKind}, +}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, +}; + +// A very cheap hash for generating non-sequential keys. 
+fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +enum KeyLayout { + /// Sequential unique keys + Sequential, + /// Random unique keys + Random, + /// Random keys, but only use the bits from the mask of them + RandomReuse(u32), +} + +enum WriteDelta { + Yes, + No, +} + +async fn ingest( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) -> anyhow::Result<()> { + let mut lsn = utils::lsn::Lsn(1000); + let mut key = Key::from_i128(0x0); + + let timeline_id = TimelineId::generate(); + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?; + + let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; + let ctx = RequestContext::new( + pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, + pageserver::context::DownloadBehavior::Download, + ); + + for i in 0..put_count { + lsn += put_size as u64; + + // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people + // usually care the most about write performance when they're blasting a huge batch of data into a huge table. + match key_layout { + KeyLayout::Sequential => { + // Use sequential order to illustrate the experience a user is likely to have + // when ingesting bulk data. + key.field6 = i as u32; + } + KeyLayout::Random => { + // Use random-order keys to avoid giving a false advantage to data structures that are + // faster when inserting on the end. + key.field6 = murmurhash32(i as u32); + } + KeyLayout::RandomReuse(mask) => { + // Use low bits only, to limit cardinality + key.field6 = murmurhash32(i as u32) & mask; + } + } + + layer.put_value(key, lsn, &data, &ctx).await?; + } + layer.freeze(lsn + 1).await; + + if matches!(write_delta, WriteDelta::Yes) { + let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { + max_concurrency: NonZeroUsize::new(1).unwrap(), + }); + let (_desc, path) = layer + .write_to_disk(&ctx, None, l0_flush_state.inner()) + .await? + .unwrap(); + tokio::fs::remove_file(path).await?; + } + + Ok(()) +} + +/// Wrapper to instantiate a tokio runtime +fn ingest_main( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async move { + let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + if let Err(e) = r { + panic!("{e:?}"); + } + }); +} + +/// Declare a series of benchmarks for the Pageserver's ingest write path. +/// +/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either +/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). +/// +/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on +/// a fast disk, CPU is the bottleneck at time of writing. 
+fn criterion_benchmark(c: &mut Criterion) { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); + eprintln!("Data directory: {}", temp_dir.path()); + + let conf: &'static PageServerConf = Box::leak(Box::new( + pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), + )); + virtual_file::init(16384, IoEngineKind::TokioEpollUring); + page_cache::init(conf.page_cache_size); + + { + let mut group = c.benchmark_group("ingest-small-values"); + let put_size = 100usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/100b seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Random, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand-1024keys", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::RandomReuse(0x3ff), + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } + + { + let mut group = c.benchmark_group("ingest-big-values"); + let put_size = 8192usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/8k seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/8k seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 8945e5accd..10187f2ba3 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -24,7 +24,7 @@ impl Default for L0FlushConfig { #[derive(Clone)] pub struct L0FlushGlobalState(Arc); -pub(crate) enum Inner { +pub enum Inner { PageCached, Direct { semaphore: tokio::sync::Semaphore }, } @@ -40,7 +40,7 @@ impl L0FlushGlobalState { } } - pub(crate) fn inner(&self) -> &Arc { + pub fn inner(&self) -> &Arc { &self.0 } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 962faa6796..bff8f7cb24 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::Layer; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::{self, VirtualFile}; use 
crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -72,7 +71,7 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -367,7 +366,6 @@ impl DeltaLayer { /// 3. Call `finish`. /// struct DeltaLayerWriterInner { - conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -414,7 +412,6 @@ impl DeltaLayerWriterInner { let tree_builder = DiskBtreeBuilder::new(block_buf); Ok(Self { - conf, path, timeline_id, tenant_shard_id, @@ -489,11 +486,10 @@ impl DeltaLayerWriterInner { async fn finish( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); - let result = self.finish0(key_end, timeline, ctx).await; + let result = self.finish0(key_end, ctx).await; if result.is_err() { tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); if let Err(e) = std::fs::remove_file(&temp_path) { @@ -506,9 +502,8 @@ impl DeltaLayerWriterInner { async fn finish0( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -573,11 +568,9 @@ impl DeltaLayerWriterInner { // fsync the file file.sync_all().await?; - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created delta layer {}", self.path); - trace!("created delta layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -678,14 +671,9 @@ impl DeltaLayerWriter { pub(crate) async fn finish( mut self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(key_end, timeline, ctx) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(key_end, ctx).await } #[cfg(test)] @@ -1592,8 +1580,9 @@ pub(crate) mod test { use super::*; use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; - use crate::tenant::Tenant; + use crate::tenant::{Tenant, Timeline}; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -1887,9 +1876,8 @@ pub(crate) mod test { res?; } - let resident = writer - .finish(entries_meta.key_range.end, &timeline, &ctx) - .await?; + let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; + let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; let inner = resident.get_as_delta(&ctx).await?; @@ -2078,7 +2066,8 @@ pub(crate) mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); + let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); copied_layer.get_as_delta(ctx).await.unwrap(); @@ -2206,7 +2195,9 @@ pub(crate) mod test { for (key, lsn, value) in deltas { writer.put_value(key, lsn, value, ctx).await?; } - let delta_layer = writer.finish(key_end, tline, ctx).await?; + + let (desc, path) = writer.finish(key_end, 
ctx).await?; + let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, anyhow::Error>(delta_layer) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6abc89c2ed..f118f3d8d8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -11,9 +11,10 @@ use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, Result}; +use camino::Utf8PathBuf; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -32,7 +33,9 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState}; +use super::{ + DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -410,8 +413,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - - pub(crate) async fn put_value( + pub async fn put_value( &self, key: Key, lsn: Lsn, @@ -476,8 +478,6 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { - let inner = self.inner.write().await; - assert!( self.start_lsn < end_lsn, "{} >= {}", @@ -495,9 +495,13 @@ impl InMemoryLayer { }) .expect("frozen_local_path_str set only once"); - for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn < end_lsn); + #[cfg(debug_assertions)] + { + let inner = self.inner.write().await; + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } } } } @@ -507,12 +511,12 @@ impl InMemoryLayer { /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk( + pub async fn write_to_disk( &self, - timeline: &Arc, ctx: &RequestContext, key_range: Option>, - ) -> Result> { + l0_flush_global_state: &l0_flush::Inner, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -524,9 +528,8 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; - let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); use l0_flush::Inner; - let _concurrency_permit = match &*l0_flush_global_state { + let _concurrency_permit = match l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire().await), }; @@ -556,7 +559,7 @@ impl InMemoryLayer { ) .await?; - match &*l0_flush_global_state { + match l0_flush_global_state { l0_flush::Inner::PageCached => { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) @@ -621,7 +624,7 @@ impl InMemoryLayer { } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; + let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. // @@ -633,6 +636,6 @@ impl InMemoryLayer { // we dirtied when writing to the filesystem have been flushed and marked !dirty. drop(_concurrency_permit); - Ok(Some(delta_layer)) + Ok(Some((desc, path))) } } diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index a966775f9e..d7bfe48c60 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -4,6 +4,7 @@ use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; +use crate::tenant::storage_layer::Layer; use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; @@ -173,8 +174,9 @@ impl SplitDeltaLayerWriter { ) .await?; let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); - self.generated_layers - .push(prev_delta_writer.finish(key, tline, ctx).await?); + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers.push(delta_layer); } self.inner.put_value(key, lsn, val, ctx).await } @@ -190,7 +192,10 @@ impl SplitDeltaLayerWriter { inner, .. } = self; - generated_layers.push(inner.finish(end_key, tline, ctx).await?); + + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(delta_layer); Ok(generated_layers) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4ff87f20f1..a05e4e0712 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3709,12 +3709,14 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let Some(new_delta) = frozen_layer - .write_to_disk(&self_clone, &ctx, key_range) + let Some((desc, path)) = frozen_layer + .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) .await? else { return Ok(None); }; + let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. 
@@ -5347,9 +5349,8 @@ impl Timeline { for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer - .finish(deltas.key_range.end, self, ctx) - .await?; + let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?; { let mut guard = self.layers.write().await; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1ff029a313..276d7b4967 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1104,14 +1104,16 @@ impl Timeline { || contains_hole { // ... if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self, ctx) - .await - .map_err(CompactionError::Other)?, - ); + let (desc, path) = writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + + new_layers.push(new_delta); writer = None; if contains_hole { @@ -1174,12 +1176,13 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push( - writer - .finish(prev_key.unwrap().next(), self, ctx) - .await - .map_err(CompactionError::Other)?, - ); + let (desc, path) = writer + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + new_layers.push(new_delta); } // Sync layers @@ -1966,13 +1969,16 @@ impl Timeline { for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } + stats.produce_delta_layer(delta_layer_writer.size()); if dry_run { return Ok(None); } - let delta_layer = delta_layer_writer - .finish(delta_key.key_range.end, tline, ctx) + + let (desc, path) = delta_layer_writer + .finish(delta_key.key_range.end, ctx) .await?; + let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } @@ -2413,9 +2419,9 @@ impl CompactionJobExecutor for TimelineAdaptor { )) }); - let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline, ctx) - .await?; + let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; + let new_delta_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_deltas.push(new_delta_layer); Ok(()) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index ee5f8cd52a..645b5ad2bf 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -488,10 +488,12 @@ async fn copy_lsn_prefix( // reuse the key instead of adding more holes between layers by using the real // highest key in the layer. 
let reused_highest_key = layer.layer_desc().key_range.end; - let copied = writer - .finish(reused_highest_key, target_timeline, ctx) + let (desc, path) = writer + .finish(reused_highest_key, ctx) .await .map_err(CopyDeltaPrefix)?; + let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) + .map_err(CopyDeltaPrefix)?; tracing::debug!(%layer, %copied, "new layer produced"); From 22790fc9075b7108d6d26b84b3ba1e1f8a9d94e7 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:55:42 -0400 Subject: [PATCH 350/412] scrubber: clean up `scan_metadata` before prod (#8565) Part of #8128. ## Problem Currently, scrubber `scan_metadata` command will return with an error code if the metadata on remote storage is corrupted with fatal errors. To safely deploy this command in a cronjob, we want to differentiate between failures while running scrubber command and the erroneous metadata. At the same time, we also want our regression tests to catch corrupted metadata using the scrubber command. ## Summary of changes - Return with error code only when the scrubber command fails - Uses explicit checks on errors and warnings to determine metadata health in regression tests. **Resolve conflict with `tenant-snapshot` command (after shard split):** [`test_scrubber_tenant_snapshot`](https://github.com/neondatabase/neon/blob/yuchen/scrubber-scan-cleanup-before-prod/test_runner/regress/test_storage_scrubber.py#L23) failed before applying 422a8443ddb7f1c7a26907a96c4aed0c5d554e67 - When taking a snapshot, the old `index_part.json` in the unsharded tenant directory is not kept. - The current `list_timeline_blobs` implementation consider no `index_part.json` as a parse error. - During the scan, we are only analyzing shards with highest shard count, so we will not get a parse error. but we do need to add the layers to tenant object listing, otherwise we will get index is referencing a layer that is not in remote storage error. - **Action:** Add s3_layers from `list_timeline_blobs` regardless of parsing error Signed-off-by: Yuchen Liang --- storage_scrubber/src/checks.rs | 14 ++++++++---- storage_scrubber/src/main.rs | 10 ++++----- .../src/pageserver_physical_gc.rs | 14 ++++++++---- .../src/scan_pageserver_metadata.rs | 22 +++++++++++++------ storage_scrubber/src/tenant_snapshot.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 14 +++++++++--- test_runner/regress/test_compatibility.py | 5 ++--- .../regress/test_pageserver_generations.py | 5 ++--- .../regress/test_pageserver_secondary.py | 3 ++- test_runner/regress/test_sharding.py | 3 ++- test_runner/regress/test_storage_scrubber.py | 11 +++++----- test_runner/regress/test_tenant_delete.py | 8 +++---- 12 files changed, 70 insertions(+), 41 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 14788515dd..35ec69fd50 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( - parse_errors + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => result.errors.extend( + errors .into_iter() .map(|error| format!("parse error: {error}")), ), @@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult { }, /// The remains of a deleted Timeline (i.e. 
an initdb archive only) Relic, - Incorrect(Vec), + Incorrect { + errors: Vec, + s3_layers: HashSet<(LayerName, Generation)>, + }, } pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { @@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs( } Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Incorrect(errors), + blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, unused_index_keys: index_part_keys, unknown_keys, }) diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index a111c31844..cbc836755a 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> { } if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) + tracing::error!("Fatal scrub errors detected"); } else if summary.is_empty() { // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( + tracing::error!( "No timelines found in bucket {} prefix {}", bucket_config.bucket, bucket_config .prefix_in_bucket .unwrap_or("".to_string()) - )) - } else { - Ok(()) + ); } + + Ok(()) } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 69896caa82..ff230feae3 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -389,10 +389,13 @@ async fn gc_ancestor( // Post-deletion tenant location: don't try and GC it. continue; } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, // TODO(yuchen): could still check references to these s3 layers? + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently tracing::warn!( - "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}" ); continue; } @@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc( // Post-deletion tenant location: don't try and GC it. 
return Ok(summary); } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); return Ok(summary); } }; diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index dc410bde41..b9630056e1 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -290,13 +290,21 @@ pub async fn scan_metadata( } } - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation: _index_part_generation, - s3_layers, - } = &data.blob_data - { - tenant_objects.push(ttid, s3_layers.clone()); + match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } + BlobDataParseResult::Relic => (), + BlobDataParseResult::Incorrect { + errors: _, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } } tenant_timeline_results.push((ttid, data)); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 5a75f8d40e..1866e6ec80 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -269,7 +269,7 @@ impl SnapshotDownloader { .context("Downloading timeline")?; } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(_) => { + BlobDataParseResult::Incorrect { .. } => { tracing::error!("Bad metadata in timeline {ttid}"); } }; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7289472de2..c6f4404784 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -978,7 +978,10 @@ class NeonEnvBuilder: and self.enable_scrub_on_exit ): try: - self.env.storage_scrubber.scan_metadata() + healthy, _ = self.env.storage_scrubber.scan_metadata() + if not healthy: + e = Exception("Remote storage metadata corrupted") + cleanup_error = e except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -4411,14 +4414,19 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Any: + def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. + """ args = ["scan-metadata", "--node-kind", "pageserver", "--json"] if post_to_storage_controller: args.append("--post") stdout = self.scrubber_cli(args, timeout=30) try: - return json.loads(stdout) + summary = json.loads(stdout) + healthy = not summary["with_errors"] and not summary["with_warnings"] + return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. 
Dumping stdout:") log.error(stdout) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 137b0e931d..afa5f6873c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -496,11 +496,10 @@ def test_historic_storage_formats( # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) # # Do this _before_ importing to the pageserver, as that import may start writing immediately - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() + assert healthy assert metadata_summary["tenant_count"] >= 1 assert metadata_summary["timeline_count"] >= 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] env.neon_cli.import_tenant(dataset.tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 8941ddd281..73af7950f1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 4b0af24480..8746b88a75 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -593,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 7f30b2d7a7..1011a6fd22 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -124,7 +124,8 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index e3f627b6a6..388f6a9e92 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it = iter(index.layer_metadata.items()) - scan_summary = 
env.storage_scrubber.scan_metadata(post_to_storage_controller=True) - assert not scan_summary["with_warnings"] - assert not scan_summary["with_errors"] + healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + assert healthy assert env.storage_controller.metadata_health_is_healthy() @@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata( log.info(f"delete response: {delete_response}") # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. - scan_summary = env.storage_scrubber.scan_metadata() + _, scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 assert env.storage_controller.metadata_health_is_healthy() # Now post to storage controller, expect seeing one unhealthy health record - scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) + + neon_env_builder.disable_scrub_on_exit() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c343b349cf..c01b3a2e89 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) env.stop() - env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy From 9a3bc5556a713dc1560268588ca7873f375a0489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Aug 2024 09:14:26 +0200 Subject: [PATCH 351/412] storage broker: only print one line for version and build tag in init (#8624) This makes it more consistent with pageserver and safekeeper. Also, it is easier to collect the two values into one data point. --- storage_broker/src/bin/storage_broker.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 0a4af543ab..15acd0e49c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -642,8 +642,7 @@ async fn main() -> Result<(), Box> { logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - info!("version: {GIT_VERSION}"); - info!("build_tag: {BUILD_TAG}"); + info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}"); metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); // On any shutdown signal, log receival and exit. 
From c75e6fbc46754479b5e6cfe3e52744f1c7414ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Aug 2024 09:29:52 +0200 Subject: [PATCH 352/412] Lower level for timeline cancellations during gc (#8626) Timeline cancellation running in parallel with gc yields error log lines like: ``` Gc failed 1 times, retrying in 2s: TimelineCancelled ``` They are completely harmless though and normal to occur. Therefore, only print those messages at an info level. Still print them at all so that we know what is going on if we focus on a single timeline. --- pageserver/src/tenant/tasks.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 230362d81a..b4706ea59d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", - ); + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } + wait_duration } } From 38f184bc9173ccb5ee664cde84be31bb19cf414e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Aug 2024 14:53:52 +0200 Subject: [PATCH 353/412] Add missing colon to ArchivalConfigRequest specification (#8627) Add a missing colon to the API specification of `ArchivalConfigRequest`. The `state` field is required. Pointed out by Gleb. --- pageserver/src/http/openapi_spec.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4656f2c93a..42086dc2e6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -932,7 +932,7 @@ components: description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. ArchivalConfigRequest: type: object - required + required: - state properties: state: From 1f73dfb8420ed08825dc28c6d3f443313bfceac6 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 7 Aug 2024 14:37:03 +0100 Subject: [PATCH 354/412] proxy: random changes (#8602) ## Problem 1. Hard to correlate startup parameters with the endpoint that provided them. 2. Some configurations are not needed in the `ProxyConfig` struct. ## Summary of changes Because of some borrow checker fun, I needed to switch to an interior-mutability implementation of our `RequestMonitoring` context system. Using https://docs.rs/try-lock/latest/try_lock/ as a cheap lock for such a use-case (needed to be thread safe). Removed the lock of each startup message, instead just logging only the startup params in a successful handshake. Also removed from values from `ProxyConfig` and kept as arguments. 
(needed for local-proxy config) --- Cargo.lock | 5 +- Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 40 ++--- proxy/src/auth/backend/classic.rs | 4 +- proxy/src/auth/backend/hacks.rs | 8 +- proxy/src/auth/backend/link.rs | 2 +- proxy/src/auth/credentials.rs | 60 ++++--- proxy/src/auth/flow.rs | 10 +- proxy/src/bin/pg_sni_router.rs | 6 +- proxy/src/bin/proxy.rs | 20 +-- proxy/src/cache/endpoints.rs | 2 +- proxy/src/compute.rs | 10 +- proxy/src/config.rs | 4 - proxy/src/console/provider.rs | 14 +- proxy/src/console/provider/mock.rs | 6 +- proxy/src/console/provider/neon.rs | 22 +-- proxy/src/context.rs | 241 +++++++++++++++++++------- proxy/src/context/parquet.rs | 6 +- proxy/src/metrics.rs | 31 +--- proxy/src/proxy.rs | 18 +- proxy/src/proxy/connect_compute.rs | 16 +- proxy/src/proxy/handshake.rs | 25 ++- proxy/src/proxy/tests.rs | 41 ++--- proxy/src/proxy/tests/mitm.rs | 11 +- proxy/src/proxy/wake_compute.rs | 6 +- proxy/src/serverless.rs | 4 +- proxy/src/serverless/backend.rs | 12 +- proxy/src/serverless/conn_pool.rs | 15 +- proxy/src/serverless/sql_over_http.rs | 17 +- proxy/src/serverless/websocket.rs | 4 +- 31 files changed, 386 insertions(+), 276 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 764c0fbd30..f565119dbd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4324,6 +4324,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "try-lock", "typed-json", "url", "urlencoding", @@ -6563,9 +6564,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" diff --git a/Cargo.toml b/Cargo.toml index af1c1dfc82..963841e340 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,6 +184,7 @@ tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f18b5fbc6..b316c53034 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +try-lock.workspace = true typed-json.workspace = true url.workspace = true urlencoding.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 67c4dd019e..90dea01bf3 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -218,7 +218,7 @@ impl RateBucketInfo { impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, @@ -243,7 +243,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), ), password_weight, ); @@ -274,7 +274,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. 
async fn auth_quirks( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &impl console::Api, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -303,8 +303,8 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { @@ -356,7 +356,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; match self { @@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; match self { @@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -571,7 +571,7 @@ mod tests { impl console::Api for Auth { async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -579,7 +579,7 @@ mod tests { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> { @@ -591,7 +591,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -665,7 +665,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -723,7 +723,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -742,7 +742,7 @@ mod tests { let (mut client, server) = 
tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -775,7 +775,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -794,7 +794,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -828,7 +828,7 @@ mod tests { )); let creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index b98fa63120..285fa29428 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -27,7 +27,7 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret, &mut *ctx); + let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 6b0f5e1726..56921dd949 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -18,7 +18,7 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn authenticate_cleartext( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -28,7 +28,7 @@ pub async fn authenticate_cleartext( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let ep = EndpointIdInt::from(&info.endpoint); @@ -60,7 +60,7 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) 
pub async fn password_hack_no_authentication( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result { @@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5932e1337c..95f4614736 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d06f5614f1..8f4a392131 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -84,7 +84,7 @@ pub fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub fn parse( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, @@ -249,8 +249,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -264,8 +264,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -279,9 +279,9 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); @@ -296,8 +296,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -311,8 +311,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = 
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -329,8 +329,8 @@ mod tests { ), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -344,8 +344,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -359,9 +359,9 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); @@ -374,16 +374,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) @@ -397,10 +397,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -417,10 +416,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ 
-438,9 +436,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 59d1ac17f4..acf7b4f6b6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,7 +27,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -155,7 +155,7 @@ impl AuthFlow<'_, S, Scram<'_>> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; @@ -168,10 +168,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } match sasl.method { - SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), - SCRAM_SHA_256_PLUS => { - ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) - } + SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } info!("client chooses {}", sasl.method); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d7a3eb9a4d..1038fa5116 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -205,7 +205,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -256,13 +256,13 @@ async fn ssl_handshake( } async fn handle_client( - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c1fd6dfd80..b44e0ddd2f 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -5,6 +5,7 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use futures::future::Either; use proxy::auth; use proxy::auth::backend::AuthRateLimiter; @@ -290,9 +291,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; 
info!("Authentication backend: {}", config.auth_backend); - info!("Using region: {}", config.aws_region); + info!("Using region: {}", args.aws_region); - let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region_provider.region().await); let aws_credentials_provider = { @@ -318,7 +320,7 @@ async fn main() -> anyhow::Result<()> { }; let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( elasticache::AWSIRSAConfig::new( - config.aws_region.clone(), + args.aws_region.clone(), args.redis_cluster_name, args.redis_user_id, ), @@ -376,11 +378,14 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), - &config.redis_rps_limit, + redis_rps_limit, )?))), None => None, }; @@ -656,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -676,9 +680,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut redis_rps_limit = args.redis_rps_limit.clone(); - RateBucketInfo::validate(&mut redis_rps_limit)?; - let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -687,11 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, - disable_ip_check_for_http: args.disable_ip_check_for_http, - redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), - aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 4bc10a6020..8c851790c2 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index f91693c704..21687160ea 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -288,12 +288,12 @@ impl ConnCfg { /// Connect to a corresponding compute node. 
pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); @@ -316,14 +316,14 @@ impl ConnCfg { )?; // connect_raw() will not use TLS if sslmode is "disable" - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( - cold_start_info = ctx.cold_start_info.as_str(), + cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -342,7 +342,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6504919760..1412095505 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -31,11 +31,8 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, - pub disable_ip_check_for_http: bool, - pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, - pub aws_region: String, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, @@ -55,7 +52,6 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7a9637066f..15fc0134b3 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -292,7 +292,7 @@ pub struct NodeInfo { impl NodeInfo { pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, timeout: Duration, ) -> Result { self.config @@ -330,20 +330,20 @@ pub(crate) trait Api { /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; } @@ -363,7 +363,7 @@ pub enum ConsoleBackend { impl Api for ConsoleBackend { async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; @@ -378,7 +378,7 @@ impl Api for ConsoleBackend { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { use ConsoleBackend::*; @@ -393,7 +393,7 @@ impl Api for ConsoleBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index cfe491f2aa..2093da7562 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -158,7 +158,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -168,7 +168,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -182,7 +182,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 768cd2fdfa..7eda238b66 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -57,7 +57,7 @@ impl Api { async fn do_get_auth_info( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -69,7 +69,7 @@ impl Api { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -77,7 +77,7 @@ impl Api { .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -87,7 +87,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -130,10 +130,10 @@ impl Api { async fn do_wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let mut 
request_builder = self @@ -141,7 +141,7 @@ impl Api { .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -156,7 +156,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -226,7 +226,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -268,7 +268,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index ff79ba8275..e925f67233 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -7,13 +7,14 @@ use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; use tracing::{field::display, info, info_span, Span}; +use try_lock::TryLock; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, DbName, EndpointId, RoleName, }; @@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell> /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. /// All connection logic should instead use strongly typed state machines, not a bunch of Options. -pub struct RequestMonitoring { +pub struct RequestMonitoring( + /// To allow easier use of the ctx object, we have interior mutability. + /// I would typically use a RefCell but that would break the `Send` requirements + /// so we need something with thread-safety. `TryLock` is a cheap alternative + /// that offers similar semantics to a `RefCell` but with synchronisation. 
+ TryLock, +); + +struct RequestMonitoringInner { pub peer_addr: IpAddr, pub session_id: Uuid, pub protocol: Protocol, @@ -85,7 +94,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - Self { + let inner = RequestMonitoringInner { peer_addr, session_id, protocol, @@ -110,7 +119,9 @@ impl RequestMonitoring { disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), disconnect_timestamp: None, - } + }; + + Self(TryLock::new(inner)) } #[cfg(test)] @@ -119,48 +130,177 @@ impl RequestMonitoring { } pub fn console_application_name(&self) -> String { + let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", - self.application.as_deref().unwrap_or_default(), - self.protocol + this.application.as_deref().unwrap_or_default(), + this.protocol ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = Some(rejected); + pub fn set_rejected(&self, rejected: bool) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.rejected = Some(rejected); } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + pub fn set_cold_start_info(&self, info: ColdStartInfo) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_cold_start_info(info); + } + + pub fn set_db_options(&self, options: StartupMessageParams) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + this.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + this.set_dbname(dbname.into()); + } + + this.pg_options = Some(options); + } + + pub fn set_project(&self, x: MetricsAuxInfo) { + let mut this = self.0.try_lock().expect("should not deadlock"); + if this.endpoint_id.is_none() { + this.set_endpoint_id(x.endpoint_id.as_str().into()) + } + this.branch = Some(x.branch_id); + this.project = Some(x.project_id); + this.set_cold_start_info(x.cold_start_info); + } + + pub fn set_project_id(&self, project_id: ProjectIdInt) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.project = Some(project_id); + } + + pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_endpoint_id(endpoint_id); + } + + pub fn set_dbname(&self, dbname: DbName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_dbname(dbname); + } + + pub fn set_user(&self, user: RoleName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user(user); + } + + pub fn set_auth_method(&self, auth_method: AuthMethod) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + self.0 + .try_lock() + .expect("should not deadlock") + .has_private_peer_addr() + } + + pub fn set_error_kind(&self, kind: ErrorKind) { + let mut this = self.0.try_lock().expect("should not deadlock"); + // Do not record errors from the private address to metrics. 
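The accessors above all funnel through try_lock().expect("should not deadlock"), which is the point of the newtype: callers share a plain &RequestMonitoring, and each accessor holds the lock only for one short read or write that never spans an .await. A minimal standalone version of that pattern, using the same try-lock crate the commit imports; Ctx and Inner are hypothetical stand-ins for RequestMonitoring and RequestMonitoringInner:

// Cargo dependency assumed: try-lock (the crate behind `use try_lock::TryLock` above).
use try_lock::TryLock;

struct Inner {
    rejected: Option<bool>,
}

// Interior mutability behind a shared reference, with Send/Sync preserved.
struct Ctx(TryLock<Inner>);

impl Ctx {
    fn new() -> Self {
        Ctx(TryLock::new(Inner { rejected: None }))
    }

    fn set_rejected(&self, rejected: bool) {
        // Accesses are short and never held across an await point, so contention
        // would indicate a bug; hence expect() instead of blocking.
        let mut this = self.0.try_lock().expect("should not deadlock");
        this.rejected = Some(rejected);
    }

    fn rejected(&self) -> Option<bool> {
        self.0.try_lock().expect("should not deadlock").rejected
    }
}

fn main() {
    let ctx = Ctx::new();
    ctx.set_rejected(true); // mutation through a shared reference
    assert_eq!(ctx.rejected(), Some(true));
}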
+ if !this.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &this.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + this.error_kind = Some(kind); + } + + pub fn set_success(&self) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.success = true; + } + + pub fn log_connect(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .log_connect(); + } + + pub fn protocol(&self) -> Protocol { + self.0.try_lock().expect("should not deadlock").protocol + } + + pub fn span(&self) -> Span { + self.0.try_lock().expect("should not deadlock").span.clone() + } + + pub fn session_id(&self) -> Uuid { + self.0.try_lock().expect("should not deadlock").session_id + } + + pub fn peer_addr(&self) -> IpAddr { + self.0.try_lock().expect("should not deadlock").peer_addr + } + + pub fn cold_start_info(&self) -> ColdStartInfo { + self.0 + .try_lock() + .expect("should not deadlock") + .cold_start_info + } + + pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause { + LatencyTimerPause { + ctx: self, + start: tokio::time::Instant::now(), + waiting_for, + } + } + + pub fn success(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .success() + } +} + +pub struct LatencyTimerPause<'a> { + ctx: &'a RequestMonitoring, + start: tokio::time::Instant, + waiting_for: Waiting, +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + self.ctx + .0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .unpause(self.start, self.waiting_for); + } +} + +impl RequestMonitoringInner { + fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); } - pub fn set_db_options(&mut self, options: StartupMessageParams) { - self.set_application(options.get("application_name").map(SmolStr::from)); - if let Some(user) = options.get("user") { - self.set_user(user.into()); - } - if let Some(dbname) = options.get("database") { - self.set_dbname(dbname.into()); - } - - self.pg_options = Some(options); - } - - pub fn set_project(&mut self, x: MetricsAuxInfo) { - if self.endpoint_id.is_none() { - self.set_endpoint_id(x.endpoint_id.as_str().into()) - } - self.branch = Some(x.branch_id); - self.project = Some(x.project_id); - self.set_cold_start_info(x.cold_start_info); - } - - pub fn set_project_id(&mut self, project_id: ProjectIdInt) { - self.project = Some(project_id); - } - - pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); let metric = &Metrics::get().proxy.connecting_endpoints; @@ -176,44 +316,23 @@ impl RequestMonitoring { } } - pub fn set_dbname(&mut self, dbname: DbName) { + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } - pub fn set_user(&mut self, user: RoleName) { + fn set_user(&mut self, user: RoleName) { self.span.record("role", display(&user)); self.user = Some(user); } - pub fn set_auth_method(&mut self, auth_method: AuthMethod) { - self.auth_method = Some(auth_method); - } - - pub fn has_private_peer_addr(&self) -> bool { + fn has_private_peer_addr(&self) -> bool { match self.peer_addr { IpAddr::V4(ip) => ip.is_private(), _ => false, } } - pub fn set_error_kind(&mut self, kind: ErrorKind) { - // Do not record errors 
from the private address to metrics. - if !self.has_private_peer_addr() { - Metrics::get().proxy.errors_total.inc(kind); - } - if let Some(ep) = &self.endpoint_id { - let metric = &Metrics::get().proxy.endpoints_affected_by_errors; - let label = metric.with_labels(kind); - metric.get_metric(label).measure(ep); - } - self.error_kind = Some(kind); - } - - pub fn set_success(&mut self) { - self.success = true; - } - - pub fn log_connect(&mut self) { + fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -256,7 +375,7 @@ impl RequestMonitoring { } } -impl Drop for RequestMonitoring { +impl Drop for RequestMonitoringInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 543a458274..bb02a476fc 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,7 +23,7 @@ use utils::backoff; use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; -use super::{RequestMonitoring, LOG_CHAN}; +use super::{RequestMonitoringInner, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> { } } -impl From<&RequestMonitoring> for RequestData { - fn from(value: &RequestMonitoring) -> Self { +impl From<&RequestMonitoringInner> for RequestData { + fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db25ac0311..0167553e30 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -370,6 +370,7 @@ pub struct CancellationRequest { pub kind: CancellationOutcome, } +#[derive(Clone, Copy)] pub enum Waiting { Cplane, Client, @@ -398,12 +399,6 @@ pub struct LatencyTimer { outcome: ConnectOutcome, } -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, - start: time::Instant, - waiting_for: Waiting, -} - impl LatencyTimer { pub fn new(protocol: Protocol) -> Self { Self { @@ -417,11 +412,13 @@ impl LatencyTimer { } } - pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { - LatencyTimerPause { - timer: self, - start: Instant::now(), - waiting_for, + pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { + let dur = start.elapsed(); + match waiting_for { + Waiting::Cplane => self.accumulated.cplane += dur, + Waiting::Client => self.accumulated.client += dur, + Waiting::Compute => self.accumulated.compute += dur, + Waiting::RetryTimeout => self.accumulated.retry += dur, } } @@ -438,18 +435,6 @@ impl LatencyTimer { } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - let dur = self.start.elapsed(); - match self.waiting_for { - Waiting::Cplane => self.timer.accumulated.cplane += dur, - Waiting::Client => self.timer.accumulated.client += dur, - Waiting::Compute => self.timer.accumulated.compute += dur, - Waiting::RetryTimeout => self.timer.accumulated.retry += dur, - } - } -} - #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum ConnectOutcome { Success, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 3edefcf21a..2182f38fe7 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -113,18 +113,18 @@ pub async fn task_main( } }; - let mut ctx = RequestMonitoring::new( + let ctx = RequestMonitoring::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); let startup = 
Box::pin( handle_client( config, - &mut ctx, + &ctx, cancellation_handler, socket, ClientMode::Tcp, @@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, mode: ClientMode, @@ -248,25 +248,25 @@ pub async fn handle_client( conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); let metrics = &Metrics::get().proxy; - let proto = ctx.protocol; + let proto = ctx.protocol(); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); let record_handshake_error = !ctx.has_private_peer_addr(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id) + .cancel_session(cancel_key_data, ctx.session_id()) .await .map(|()| None)?) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 82180aaee3..f38e43ba5a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -46,7 +46,7 @@ pub trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -58,7 +58,7 @@ pub trait ConnectMechanism { pub trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result; fn get_keys(&self) -> Option<&ComputeCredentialKeys>; @@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. 
#[tracing::instrument(skip_all)] pub async fn connect_to_compute( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -126,7 +126,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -178,7 +178,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -209,9 +209,7 @@ where let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index d488aea927..c65a5558d9 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -10,6 +10,7 @@ use tracing::{info, warn}; use crate::{ auth::endpoint_sni, config::{TlsConfig, PG_ALPN_PROTOCOL}, + context::RequestMonitoring, error::ReportableError, metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, @@ -67,6 +68,7 @@ pub enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] pub async fn handshake( + ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -80,8 +82,6 @@ pub async fn handshake( let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - use FeStartupPacket::*; match msg { SslRequest { direct } => match stream.get_ref() { @@ -145,16 +145,20 @@ pub async fn handshake( let conn_info = tls_stream.get_ref().1; + // try parse endpoint + let ep = conn_info + .server_name() + .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + if let Some(ep) = ep { + ctx.set_endpoint_id(ep); + } + // check the ALPN, if exists, as required. match conn_info.alpn_protocol() { None | Some(PG_ALPN_PROTOCOL) => {} Some(other) => { - // try parse ep for better error - let ep = conn_info.server_name().and_then(|sni| { - endpoint_sni(sni, &tls.common_names).ok().flatten() - }); let alpn = String::from_utf8_lossy(other); - warn!(?ep, %alpn, "unexpected ALPN"); + warn!(%alpn, "unexpected ALPN"); return Err(HandshakeError::ProtocolViolation); } } @@ -198,7 +202,12 @@ pub async fn handshake( .await?; } - info!(?version, session_type = "normal", "successful handshake"); + info!( + ?version, + ?params, + session_type = "normal", + "successful handshake" + ); break Ok(HandshakeData::Startup(stream, params)); } // downgrade protocol version diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5186a9e1b0..d8308c4f2a 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -155,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestMonitoring::test())) .await? 
.authenticate() .await?; @@ -175,10 +175,11 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = match handshake(client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = + match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -457,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -565,7 +566,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -573,7 +574,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -583,7 +584,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -591,7 +592,7 @@ async fn connect_to_compute_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -602,7 +603,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -610,7 +611,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -621,7 +622,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -629,7 +630,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, 
config, config) .await .unwrap(); mechanism.verify(); @@ -641,7 +642,7 @@ async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); @@ -656,7 +657,7 @@ async fn connect_to_compute_non_retry_3() { backoff_factor: 2.0, }; connect_to_compute( - &mut ctx, + &ctx, &mechanism, &user_info, false, @@ -673,7 +674,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -681,7 +682,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -692,7 +693,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -700,7 +701,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index d96dd0947b..c8ec2b2db6 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,9 +34,14 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1), false) - .await - .unwrap() + let (end_client, startup) = match handshake( + &RequestMonitoring::test(), + client1, + Some(&server_config1), + false, + ) + .await + .unwrap() { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fef349aac0..5b06e8f054 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -14,7 +14,7 @@ use super::connect_compute::ComputeConnectBackend; pub async fn wake_compute( num_retries: &mut u32, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &B, config: RetryConfig, ) -> Result { @@ -52,9 +52,7 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index efa999ed7d..115bef7375 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs 
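The latency_timer_pause(...) calls sprinkled through this commit (auth flow, compute connect, the retry sleeps above) all rely on the same RAII shape: the returned guard remembers when the pause started and, on drop, adds the elapsed time to the bucket named by Waiting (see the LatencyTimerPause and LatencyTimer::unpause hunks in context.rs and metrics.rs). A standalone sketch of that pattern with hypothetical WaitAccumulator/PauseGuard types, not the proxy's own:

use std::cell::Cell;
use std::time::{Duration, Instant};

#[derive(Clone, Copy)]
enum Waiting {
    Client,
    Compute,
}

struct WaitAccumulator {
    client: Cell<Duration>,
    compute: Cell<Duration>,
}

struct PauseGuard<'a> {
    acc: &'a WaitAccumulator,
    start: Instant,
    waiting_for: Waiting,
}

impl Drop for PauseGuard<'_> {
    fn drop(&mut self) {
        // On drop, attribute the elapsed wall-clock time to the right bucket,
        // mirroring LatencyTimer::unpause(start, waiting_for) below.
        let dur = self.start.elapsed();
        let bucket = match self.waiting_for {
            Waiting::Client => &self.acc.client,
            Waiting::Compute => &self.acc.compute,
        };
        bucket.set(bucket.get() + dur);
    }
}

impl WaitAccumulator {
    fn pause(&self, waiting_for: Waiting) -> PauseGuard<'_> {
        PauseGuard {
            acc: self,
            start: Instant::now(),
            waiting_for,
        }
    }
}

fn main() {
    let acc = WaitAccumulator {
        client: Cell::new(Duration::ZERO),
        compute: Cell::new(Duration::ZERO),
    };
    {
        let _paused = acc.pause(Waiting::Client);
        std::thread::sleep(Duration::from_millis(5));
    } // guard dropped here; the waiting time lands in the Client bucket
    assert!(acc.client.get() >= Duration::from_millis(5));
    assert_eq!(acc.compute.get(), Duration::ZERO);
}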
@@ -334,7 +334,7 @@ async fn request_handler( &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) @@ -367,7 +367,7 @@ async fn request_handler( crate::metrics::Protocol::Http, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 3b86c1838c..80d46c67eb 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -35,15 +35,15 @@ pub struct PoolingBackend { impl PoolingBackend { pub async fn authenticate( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self .endpoint_rate_limiter @@ -100,7 +100,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub async fn connect_to_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, @@ -222,7 +222,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -240,7 +240,7 @@ impl ConnectMechanism for TokioMechanism { .param("client_encoding", "UTF8") .expect("client encoding UTF8 is always valid"); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index dbc58d48ec..e1dc44dc1c 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -377,7 +377,7 @@ impl GlobalConnPool { pub fn get( self: &Arc, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; @@ -409,9 +409,9 @@ impl GlobalConnPool { cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); - client.session.send(ctx.session_id)?; + client.session.send(ctx.session_id())?; ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.latency_timer.success(); + ctx.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } } @@ -465,19 +465,19 @@ impl GlobalConnPool { pub fn poll_client( global_pool: Arc>, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); - let mut session_id = ctx.session_id; + let conn_gauge = 
Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); - let cold_start_info = ctx.cold_start_info; + let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); @@ -766,7 +766,6 @@ mod tests { opt_in: false, max_total_conns: 3, }, - request_timeout: Duration::from_secs(1), cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, })); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 6400e4ac7b..77ec6b1c73 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -144,7 +144,7 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, headers: &HeaderMap, tls: &TlsConfig, ) -> Result { @@ -224,12 +224,12 @@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { Ok(r) => { @@ -482,13 +482,16 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + let _requeset_gauge = Metrics::get() + .proxy + .connection_requests + .guard(ctx.protocol()); info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -544,7 +547,7 @@ async fn handle_inner( .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else - ctx.latency_timer.success(); + ctx.success(); Ok::<_, HttpConnError>(client) } .map_err(SqlOverHttpError::from), diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0d5b88f07b..4fba4d141c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, @@ -145,7 +145,7 @@ pub async fn serve_websocket( let res = Box::pin(handle_client( config, - &mut ctx, + &ctx, cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, From c0776b8724aa339eb1a06b39d701bb65b74bc180 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Aug 2024 17:50:09 +0300 Subject: [PATCH 355/412] fix: EphemeralFiles can outlive their Timeline via `enum LayerManager` (#8229) Ephemeral files cleanup on drop but did not delay shutdown, leading to problems with restarting the tenant. 
The solution is as proposed: - make ephemeral files carry the gate guard to delay `Timeline::gate` closing - flush in-memory layers and strong references to those on `Timeline::shutdown` The above are realized by making LayerManager an `enum` with `Open` and `Closed` variants, and fail requests to modify `LayerMap`. Additionally: - fix too eager anyhow conversions in compaction - unify how we freeze layers and handle errors - optimize likely_resident_layers to read LayerFileManager hashmap values instead of bouncing through LayerMap Fixes: #7830 --- libs/utils/src/sync/gate.rs | 3 +- pageserver/benches/bench_ingest.rs | 6 +- pageserver/src/http/routes.rs | 5 +- pageserver/src/tenant.rs | 38 +- pageserver/src/tenant/ephemeral_file.rs | 45 ++- .../src/tenant/ephemeral_file/page_caching.rs | 10 +- pageserver/src/tenant/layer_map.rs | 4 +- pageserver/src/tenant/mgr.rs | 9 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../tenant/storage_layer/inmemory_layer.rs | 4 +- .../src/tenant/storage_layer/layer/tests.rs | 20 +- pageserver/src/tenant/timeline.rs | 331 +++++++++++------- pageserver/src/tenant/timeline/compaction.rs | 35 +- .../src/tenant/timeline/detach_ancestor.rs | 17 +- .../src/tenant/timeline/eviction_task.rs | 66 ++-- .../src/tenant/timeline/layer_manager.rs | 214 ++++++----- 16 files changed, 505 insertions(+), 306 deletions(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 156b99a010..16ec563fa7 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -78,8 +78,9 @@ impl Drop for GateGuard { } } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GateError { + #[error("gate is closed")] GateClosed, } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index af2b6934c6..459394449a 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -61,7 +61,11 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); - let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + let entered = gate.enter().unwrap(); + + let layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; let ctx = RequestContext::new( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fdab780bfb..a983d8c4c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1162,7 +1162,10 @@ async fn layer_map_info_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let layer_map_info = timeline.layer_map_info(reset).await; + let layer_map_info = timeline + .layer_map_info(reset) + .await + .map_err(|_shutdown| ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 989ed0d4eb..2422ab4cf2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -601,6 +601,12 @@ impl From for GcError { } } +impl From for GcError { + fn from(_: timeline::layer_manager::Shutdown) -> Self { + GcError::TimelineCancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum LoadConfigError { #[error("TOML deserialization error: '{0}'")] @@ -710,6 +716,7 @@ impl Tenant { .read() .await .layer_map() + .expect("currently loading, layer manager cannot be shutdown 
already") .iter_historic_layers() .next() .is_some(), @@ -4674,10 +4681,10 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map - .layer_map() - .get_level0_deltas() - .into_iter() - .map(|desc| layer_map.get_from_desc(&desc)) + .layer_map()? + .level0_deltas() + .iter() + .map(|desc| layer_map.get_from_desc(desc)) .collect::>(); assert!(!level0_deltas.is_empty()); @@ -4908,11 +4915,13 @@ mod tests { let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; - guard.layer_map().dump(true, &ctx).await?; + let lm = guard.layer_map()?; + + lm.dump(true, &ctx).await?; let mut reads = Vec::new(); let mut prev = None; - guard.layer_map().iter_historic_layers().for_each(|desc| { + lm.iter_historic_layers().for_each(|desc| { if !desc.is_delta() { prev = Some(desc.clone()); return; @@ -5918,23 +5927,12 @@ mod tests { tline.freeze_and_flush().await?; // force create a delta layer } - let before_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas() - .len(); + let before_num_l0_delta_files = + tline.layers.read().await.layer_map()?.level0_deltas().len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - let after_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas() - .len(); + let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index bb65ae24fc..770f3ca5f0 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -29,6 +29,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -51,10 +52,12 @@ impl EphemeralFile { ) .await?; + let prewarm = conf.l0_flush.prewarm_on_write(); + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), + rw: page_caching::RW::new(file, prewarm, gate_guard), }) } @@ -161,7 +164,11 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + + let entered = gate.enter().unwrap(); + + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( @@ -215,4 +222,38 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn ephemeral_file_holds_gate_open() { + const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); + + let (conf, tenant_id, timeline_id, ctx) = + harness("ephemeral_file_holds_gate_open").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let mut closing = tokio::task::spawn(async move { + gate.close().await; + }); + + // gate is entered until the ephemeral file is dropped + // do not start paused 
tokio-epoll-uring has a sleep loop + tokio::time::pause(); + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect_err("closing cannot complete before dropping"); + + // this is a requirement of the reset_tenant functionality: we have to be able to restart a + // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate + drop(file); + + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect("closing completes right away") + .expect("closing does not panic"); + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 43b9fff28d..0a12b64a7c 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -18,6 +18,8 @@ use super::zero_padded_read_write; pub struct RW { page_cache_file_id: page_cache::FileId, rw: super::zero_padded_read_write::RW, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop). + _gate_guard: utils::sync::gate::GateGuard, } /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], @@ -29,7 +31,11 @@ pub enum PrewarmOnWrite { } impl RW { - pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { + pub fn new( + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + _gate_guard: utils::sync::gate::GateGuard, + ) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, @@ -38,6 +44,7 @@ impl RW { file, prewarm_on_write, )), + _gate_guard, } } @@ -145,6 +152,7 @@ impl Drop for RW { // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. // unlink the file + // we are clear to do this, because we have entered a gate let res = std::fs::remove_file(&self.rw.as_writer().file.path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index ba9c08f6e7..844f117ea2 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -846,8 +846,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Vec> { - self.l0_delta_layers.to_vec() + pub fn level0_deltas(&self) -> &Vec> { + &self.l0_delta_layers } /// debugging function to print out the contents of the layer map diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b5568d37b5..7901fc3554 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1767,14 +1767,9 @@ impl TenantManager { let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); - let timeline_layers = timeline - .layers - .read() - .await - .likely_resident_layers() - .collect::>(); + let layers = timeline.layers.read().await; - for layer in timeline_layers { + for layer in layers.likely_resident_layers() { let relative_path = layer .local_path() .strip_prefix(&parent_path) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index bff8f7cb24..f4e965b99a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1957,6 +1957,7 @@ pub(crate) mod test { .await .likely_resident_layers() .next() + .cloned() .unwrap(); { @@ -2031,7 +2032,8 @@ pub(crate) mod test { 
.read() .await .likely_resident_layers() - .find(|x| x != &initdb_layer) + .find(|&x| x != &initdb_layer) + .cloned() .unwrap(); // create a copy for the timeline, so we don't overwrite the file diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f118f3d8d8..57d93feaaf 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -385,11 +385,13 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let file = + EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 6b0d5f09ff..bffd2db800 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -39,7 +39,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -176,7 +176,7 @@ async fn smoke_test() { { let layers = &[layer]; let mut g = timeline.layers.write().await; - g.finish_gc_timeline(layers); + g.open_mut().unwrap().finish_gc_timeline(layers); // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -216,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -260,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() { // the deletion of the layer in remote_storage happens. 
{ let mut layers = timeline.layers.write().await; - layers.finish_gc_timeline(&[layer]); + layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; @@ -301,7 +301,7 @@ fn read_wins_pending_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -433,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -602,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -682,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -801,9 +801,9 @@ async fn eviction_cancellation_on_drop() { let (evicted_layer, not_evicted) = { let mut layers = { let mut guard = timeline.layers.write().await; - let layers = guard.likely_resident_layers().collect::>(); + let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap - guard.finish_gc_timeline(&layers); + guard.open_mut().unwrap().finish_gc_timeline(&layers); layers }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a05e4e0712..8f9ff78fd8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -527,6 +527,12 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(_: layer_manager::Shutdown) -> Self { + PageReconstructError::Cancelled + } +} + impl GetVectoredError { #[cfg(test)] pub(crate) fn is_missing_key_error(&self) -> bool { @@ -534,6 +540,12 @@ impl GetVectoredError { } } +impl From for GetVectoredError { + fn from(_: layer_manager::Shutdown) -> Self { + GetVectoredError::Cancelled + } +} + pub struct MissingKeyError { key: Key, shard: ShardNumber, @@ -597,6 +609,12 @@ pub(crate) enum CreateImageLayersError { Other(#[from] anyhow::Error), } +impl From for CreateImageLayersError { + fn from(_: layer_manager::Shutdown) -> Self { + CreateImageLayersError::Cancelled + } +} + #[derive(thiserror::Error, Debug, Clone)] pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled @@ -634,6 +652,12 @@ impl FlushLayerError { } } +impl From for FlushLayerError { + fn from(_: layer_manager::Shutdown) -> Self { + FlushLayerError::Cancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetVectoredError { #[error("timeline shutting down")] @@ -1198,12 +1222,7 @@ impl Timeline { /// Hence, the result **does not represent local filesystem usage**. 
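The From<layer_manager::Shutdown> conversions above are what let call sites write guard.layer_map()? or guard.open_mut()? and have a closed layer manager surface as a cancellation error instead of a panic. A trimmed-down, self-contained model of that Open/Closed split; the names mirror the patch, but the bodies are illustrative only:

struct LayerMap;

#[derive(Debug)]
struct Shutdown;

enum LayerManager {
    Open(LayerMap),
    Closed,
}

impl LayerManager {
    // Read access fails once the timeline has shut down, instead of silently
    // handing out a map that is about to disappear.
    fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
        match self {
            LayerManager::Open(map) => Ok(map),
            LayerManager::Closed => Err(Shutdown),
        }
    }

    fn open_mut(&mut self) -> Result<&mut LayerMap, Shutdown> {
        match self {
            LayerManager::Open(map) => Ok(map),
            LayerManager::Closed => Err(Shutdown),
        }
    }
}

// Callers translate Shutdown into their own "cancelled" error, as the
// From<layer_manager::Shutdown> impls above do.
#[derive(Debug)]
enum PageReconstructError {
    Cancelled,
}

impl From<Shutdown> for PageReconstructError {
    fn from(_: Shutdown) -> Self {
        PageReconstructError::Cancelled
    }
}

fn read_something(lm: &LayerManager) -> Result<u64, PageReconstructError> {
    let _map = lm.layer_map()?; // `?` turns Shutdown into Cancelled via the From impl
    Ok(0)
}

fn main() {
    let mut lm = LayerManager::Open(LayerMap);
    assert!(lm.open_mut().is_ok());
    lm = LayerManager::Closed;
    assert!(matches!(read_something(&lm), Err(PageReconstructError::Cancelled)));
}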
pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); - let mut size = 0; - for l in layer_map.iter_historic_layers() { - size += l.file_size; - } - size + guard.layer_size_sum() } pub(crate) fn resident_physical_size(&self) -> u64 { @@ -1370,16 +1389,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = { + let token = { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let mut g = self.write_lock.lock().await; let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await; - to_lsn + self.freeze_inmem_layer_at(to_lsn, &mut g).await? }; - self.flush_frozen_layers_and_wait(to_lsn).await + self.wait_flush_completion(token).await } // Check if an open ephemeral layer should be closed: this provides @@ -1393,12 +1411,20 @@ impl Timeline { return; }; + // FIXME: why not early exit? because before #7927 the state would had been cleared every + // time, and this was missed. + // if write_guard.is_none() { return; } + let Ok(layers_guard) = self.layers.try_read() else { // Don't block if the layer lock is busy return; }; - let Some(open_layer) = &layers_guard.layer_map().open_layer else { + let Ok(lm) = layers_guard.layer_map() else { + return; + }; + + let Some(open_layer) = &lm.open_layer else { // If there is no open layer, we have no layer freezing to do. However, we might need to generate // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions // that didn't result in writes to this shard. @@ -1424,9 +1450,16 @@ impl Timeline { ); // The flush loop will update remote consistent LSN as well as disk consistent LSN. - self.flush_frozen_layers_and_wait(last_record_lsn) - .await - .ok(); + // We know there is no open layer, so we can request freezing without actually + // freezing anything. This is true even if we have dropped the layers_guard, we + // still hold the write_guard. + let _ = async { + let token = self + .freeze_inmem_layer_at(last_record_lsn, &mut write_guard) + .await?; + self.wait_flush_completion(token).await + } + .await; } } @@ -1464,33 +1497,26 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - let at_lsn = match open_layer.info() { + match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); - None } InMemoryLayerInfo::Open { .. 
} => { // Upgrade to a write lock and freeze the layer drop(layers_guard); - let mut layers_guard = self.layers.write().await; - let froze = layers_guard - .try_freeze_in_memory_layer( - current_lsn, - &self.last_freeze_at, - &mut write_guard, - ) + let res = self + .freeze_inmem_layer_at(current_lsn, &mut write_guard) .await; - Some(current_lsn).filter(|_| froze) - } - }; - if let Some(lsn) = at_lsn { - let res: Result = self.flush_frozen_layers(lsn); - if let Err(e) = res { - tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); + + if let Err(e) = res { + tracing::info!( + "failed to flush frozen layer after background freeze: {e:#}" + ); + } } } } @@ -1644,6 +1670,11 @@ impl Timeline { // about corner cases like s3 suddenly hanging up? self.remote_client.shutdown().await; } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that // we have some extra WAL replay to do next time the timeline starts. @@ -1662,6 +1693,7 @@ impl Timeline { // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility // to shut down the upload queue tasks. // TODO: fix that, task management should be encapsulated inside remote_client. @@ -1672,10 +1704,17 @@ impl Timeline { ) .await; - // TODO: work toward making this a no-op. See this funciton's doc comment for more context. + // TODO: work toward making this a no-op. See this function's doc comment for more context. tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; + { + // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate + // open. + let mut write_guard = self.write_lock.lock().await; + self.layers.write().await.shutdown(&mut write_guard); + } + // Finally wait until any gate-holders are complete. 
// // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks @@ -1769,9 +1808,12 @@ impl Timeline { } } - pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info( + &self, + reset: LayerAccessStatsReset, + ) -> Result { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1780,16 +1822,15 @@ impl Timeline { in_memory_layers.push(frozen_layer.info()); } - let mut historic_layers = Vec::new(); - for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); - } + let historic_layers = layer_map + .iter_historic_layers() + .map(|desc| guard.get_from_desc(&desc).info(reset)) + .collect(); - LayerMapInfo { + Ok(LayerMapInfo { in_memory_layers, historic_layers, - } + }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] @@ -1797,7 +1838,7 @@ impl Timeline { &self, layer_file_name: &LayerName, ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await else { + let Some(layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -1818,7 +1859,7 @@ impl Timeline { .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; - let Some(local_layer) = self.find_layer(layer_file_name).await else { + let Some(local_layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -2304,7 +2345,10 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.initialize_empty(Lsn(start_lsn.0)); + layers + .open_mut() + .expect("in this context the LayerManager must still be open") + .initialize_empty(Lsn(start_lsn.0)); } /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only @@ -2436,7 +2480,10 @@ impl Timeline { let num_layers = loaded_layers.len(); - guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); + guard + .open_mut() + .expect("layermanager must be open during init") + .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); self.remote_client .schedule_layer_file_deletion(&needs_cleanup)?; @@ -2471,7 +2518,7 @@ impl Timeline { // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) drop(guard); // drop write lock, update_layer_visibility will take a read lock. - self.update_layer_visibility().await; + self.update_layer_visibility().await?; info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -2893,16 +2940,17 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerName) -> Option { + async fn find_layer( + &self, + layer_name: &LayerName, + ) -> Result, layer_manager::Shutdown> { let guard = self.layers.read().await; - for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.layer_name(); - if layer_name == &historic_layer_name { - return Some(guard.get_from_desc(&historic_layer)); - } - } - - None + let layer = guard + .layer_map()? 
+ .iter_historic_layers() + .find(|l| &l.layer_name() == layer_name) + .map(|found| guard.get_from_desc(&found)); + Ok(layer) } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -2953,6 +3001,7 @@ impl Timeline { } impl Timeline { + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -3104,7 +3153,7 @@ impl Timeline { // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; @@ -3256,22 +3305,35 @@ impl Timeline { } } + /// Returns a non-frozen open in-memory layer for ingestion. /// - /// Get a handle to the latest layer for appending. - /// + /// Takes a witness of timeline writer state lock being held, because it makes no sense to call + /// this function without holding the mutex. async fn get_layer_for_write( &self, lsn: Lsn, + _guard: &tokio::sync::MutexGuard<'_, Option>, ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; + let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + let layer = guard + .open_mut()? .get_layer_for_write( lsn, - self.get_last_record_lsn(), self.conf, self.timeline_id, self.tenant_shard_id, + gate_guard, ctx, ) .await?; @@ -3285,21 +3347,48 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } + /// Freeze any existing open in-memory layer and unconditionally notify the flush loop. + /// + /// Unconditional flush loop notification is given because in sharded cases we will want to + /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps. async fn freeze_inmem_layer_at( &self, at: Lsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, - ) { + ) -> Result { let frozen = { let mut guard = self.layers.write().await; guard + .open_mut()? .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) .await }; + if frozen { let now = Instant::now(); *(self.last_freeze_ts.write().unwrap()) = now; } + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. + let mut my_flush_request = 0; + + let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; + if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { + return Err(FlushLayerError::NotRunning(flush_loop_state)); + } + + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + *lsn = std::cmp::max(at, *lsn); + }); + + assert_ne!(my_flush_request, 0); + + Ok(my_flush_request) } /// Layer flusher task's main loop. 
@@ -3336,7 +3425,11 @@ impl Timeline { let layer_to_flush = { let guard = self.layers.read().await; - guard.layer_map().frozen_layers.front().cloned() + let Ok(lm) = guard.layer_map() else { + info!("dropping out of flush loop for timeline shutdown"); + return; + }; + lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { @@ -3393,34 +3486,7 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. - /// - /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the - /// case, it means no data will be written between the top of the highest frozen layer and - /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data - /// locally for that part of the WAL. - fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { - // Increment the flush cycle counter and wake up the flush task. - // Remember the new value, so that when we listen for the flush - // to finish, we know when the flush that we initiated has - // finished, instead of some other flush that was started earlier. - let mut my_flush_request = 0; - - let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; - if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - return Err(FlushLayerError::NotRunning(flush_loop_state)); - } - - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - my_flush_request = *counter + 1; - *counter = my_flush_request; - *lsn = std::cmp::max(at_lsn, *lsn); - }); - - Ok(my_flush_request) - } - + /// Waits any flush request created by [`Self::freeze_inmem_layer_at`] to complete. async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); loop { @@ -3453,11 +3519,6 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { - let token = self.flush_frozen_layers(at_lsn)?; - self.wait_flush_completion(token).await - } - /// Flush one frozen in-memory layer to disk, as a new delta layer. /// /// Return value is the last lsn (inclusive) of the layer that was frozen. @@ -3594,11 +3655,11 @@ impl Timeline { { let mut guard = self.layers.write().await; - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); + guard.open_mut()?.finish_flush_l0_layer( + delta_layer_to_add.as_ref(), + &frozen_layer, + &self.metrics, + ); if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn @@ -3806,7 +3867,9 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let layers = guard.layer_map(); + let Ok(layers) = guard.layer_map() else { + return false; + }; let mut max_deltas = 0; for part_range in &partition.ranges { @@ -4214,13 +4277,16 @@ impl Timeline { let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right - // now they are being scheduled outside of write lock - guard.track_new_image_layers(&image_layers, &self.metrics); + // now they are being scheduled outside of write lock; current way is inconsistent with + // compaction lock order. + guard + .open_mut()? 
+ .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered - self.update_layer_visibility().await; + self.update_layer_visibility().await?; Ok(image_layers) } @@ -4379,6 +4445,12 @@ impl CompactionError { } } +impl From for CompactionError { + fn from(_: layer_manager::Shutdown) -> Self { + CompactionError::ShuttingDown + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4484,11 +4556,14 @@ impl Timeline { .collect(); if !new_images.is_empty() { - guard.track_new_image_layers(new_images, &self.metrics); + guard + .open_mut()? + .track_new_image_layers(new_images, &self.metrics); } - // deletion will happen later, the layer file manager calls garbage_collect_on_drop - guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); + guard + .open_mut()? + .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); self.remote_client .schedule_compaction_update(&remove_layers, new_deltas)?; @@ -4502,7 +4577,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> Result<(), super::upload_queue::NotInitialized> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4510,7 +4585,9 @@ impl Timeline { replace_layers.retain(|(l, _)| guard.contains(l)); drop_layers.retain(|l| guard.contains(l)); - guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + guard + .open_mut()? + .rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -4799,7 +4876,7 @@ impl Timeline { // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -4927,7 +5004,7 @@ impl Timeline { } })?; - guard.finish_gc_timeline(&gc_layers); + guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { @@ -5083,9 +5160,13 @@ impl Timeline { let remaining = { let guard = self.layers.read().await; - guard - .layer_map() - .iter_historic_layers() + let Ok(lm) = guard.layer_map() else { + // technically here we could look into iterating accessible layers, but downloading + // all layers of a shutdown timeline makes no sense regardless. 
+ tracing::info!("attempted to download all layers of shutdown timeline"); + return; + }; + lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; @@ -5195,7 +5276,7 @@ impl Timeline { let last_activity_ts = layer.latest_activity(); EvictionCandidate { - layer: layer.into(), + layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, } @@ -5280,7 +5361,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(image_layer); + guard.open_mut().unwrap().force_insert_layer(image_layer); } Ok(()) @@ -5324,7 +5405,7 @@ impl Timeline { } let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if layer.is_delta() && overlaps_with(&layer.lsn_range, &deltas.lsn_range) && layer.lsn_range != deltas.lsn_range @@ -5354,7 +5435,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(delta_layer); + guard.open_mut().unwrap().force_insert_layer(delta_layer); } Ok(()) @@ -5369,7 +5450,7 @@ impl Timeline { ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); let mut reconstruct_data = ValuesReconstructState::default(); @@ -5397,7 +5478,7 @@ impl Timeline { ) -> anyhow::Result> { let mut layers = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { layers.push(layer.key()); } Ok(layers) @@ -5414,7 +5495,7 @@ impl Timeline { /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. 
-struct TimelineWriterState { +pub(crate) struct TimelineWriterState { open_layer: Arc, current_size: u64, // Previous Lsn which passed through @@ -5522,7 +5603,10 @@ impl<'a> TimelineWriter<'a> { } async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at, ctx).await?; + let layer = self + .tl + .get_layer_for_write(at, &self.write_guard, ctx) + .await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5535,15 +5619,15 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; // self.write_guard will be taken by the freezing self.tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) - .await; + .await?; - self.tl.flush_frozen_layers(freeze_at)?; + assert!(self.write_guard.is_none()); if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) @@ -5708,6 +5792,7 @@ mod tests { let layers = timeline.layers.read().await; let desc = layers .layer_map() + .unwrap() .iter_historic_layers() .next() .expect("must find one layer to evict"); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 276d7b4967..87ec46c0b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -371,7 +371,7 @@ impl Timeline { ); let layers = self.layers.read().await; - for layer_desc in layers.layer_map().iter_historic_layers() { + for layer_desc in layers.layer_map()?.iter_historic_layers() { let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. @@ -549,7 +549,9 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. - pub(super) async fn update_layer_visibility(&self) { + pub(super) async fn update_layer_visibility( + &self, + ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas @@ -557,7 +559,7 @@ impl Timeline { // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that // they will be subject to L0->L1 compaction in the near future. let layer_manager = self.layers.read().await; - let layer_map = layer_manager.layer_map(); + let layer_map = layer_manager.layer_map()?; let readable_points = { let children = self.gc_info.read().unwrap().retain_lsns.clone(); @@ -580,6 +582,7 @@ impl Timeline { // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can // avoid assuming that everything at a branch point is visible. 
drop(covered); + Ok(()) } /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as @@ -633,12 +636,8 @@ impl Timeline { ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas(); - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); + let layers = guard.layer_map()?; + let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. @@ -651,6 +650,11 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } + let mut level0_deltas = level0_deltas + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -1407,10 +1411,9 @@ impl Timeline { // Find the top of the historical layers let end_lsn = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; - let l0_deltas = layers.get_level0_deltas(); - drop(guard); + let l0_deltas = layers.level0_deltas(); // As an optimization, if we find that there are too few L0 layers, // bail out early. We know that the compaction algorithm would do @@ -1782,7 +1785,7 @@ impl Timeline { // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); @@ -2216,7 +2219,9 @@ impl Timeline { // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; - guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + guard + .open_mut()? + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; @@ -2296,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.flush_updates().await?; let guard = self.timeline.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let result = layer_map .iter_historic_layers() diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 645b5ad2bf..7f63b53e86 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -74,6 +74,11 @@ impl From for Error { Error::ShuttingDown } } +impl From for Error { + fn from(_: super::layer_manager::Shutdown) -> Self { + Error::ShuttingDown + } +} impl From for Error { fn from(value: FlushLayerError) -> Self { @@ -277,7 +282,7 @@ pub(super) async fn prepare( // between retries, these can change if compaction or gc ran in between. this will mean // we have to redo work. - partition_work(ancestor_lsn, &layers) + partition_work(ancestor_lsn, &layers)? 
}; // TODO: layers are already sorted by something: use that to determine how much of remote @@ -383,14 +388,14 @@ pub(super) async fn prepare( fn partition_work( ancestor_lsn: Lsn, - source_layermap: &LayerManager, -) -> (usize, Vec, Vec) { + source: &LayerManager, +) -> Result<(usize, Vec, Vec), Error> { let mut straddling_branchpoint = vec![]; let mut rest_of_historic = vec![]; let mut later_by_lsn = 0; - for desc in source_layermap.layer_map().iter_historic_layers() { + for desc in source.layer_map()?.iter_historic_layers() { // off by one chances here: // - start is inclusive // - end is exclusive @@ -409,10 +414,10 @@ fn partition_work( &mut rest_of_historic }; - target.push(source_layermap.get_from_desc(&desc)); + target.push(source.get_from_desc(&desc)); } - (later_by_lsn, straddling_branchpoint, rest_of_historic) + Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) } async fn upload_rewritten_layer( diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 1ba1bf9de5..07d860eb80 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -213,51 +213,45 @@ impl Timeline { let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; - let layers = guard.layer_map(); - for layer in layers.iter_historic_layers() { - let layer = guard.get_from_desc(&layer); - // guard against eviction while we inspect it; it might be that eviction_task and - // disk_usage_eviction_task both select the same layers to be evicted, and - // seemingly free up double the space. both succeeding is of no consequence. + guard + .likely_resident_layers() + .filter(|layer| { + let last_activity_ts = layer.latest_activity(); - if !layer.is_likely_resident() { - continue; - } + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. + return false; + } + }; - let last_activity_ts = layer.latest_activity(); - - let no_activity_for = match now.duration_since(last_activity_ts) { - Ok(d) => d, - Err(_e) => { - // We reach here if `now` < `last_activity_ts`, which can legitimately - // happen if there is an access between us getting `now`, and us getting - // the access stats from the layer. - // - // The other reason why it can happen is system clock skew because - // SystemTime::now() is not monotonic, so, even if there is no access - // to the layer after we get `now` at the beginning of this function, - // it could be that `now` < `last_activity_ts`. 
- // - // To distinguish the cases, we would need to record `Instant`s in the - // access stats (i.e., monotonic timestamps), but then, the timestamps - // values in the access stats would need to be `Instant`'s, and hence - // they would be meaningless outside of the pageserver process. - // At the time of writing, the trade-off is that access stats are more - // valuable than detecting clock skew. - continue; - } - }; - - if no_activity_for > p.threshold { + no_activity_for > p.threshold + }) + .cloned() + .for_each(|layer| { js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) .await }); stats.candidates += 1; - } - } + }); }; let join_all = async move { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e6e7bc2e77..8f20d84401 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; @@ -24,39 +24,142 @@ use crate::{ use super::TimelineWriterState; /// Provides semantic APIs to manipulate the layer map. -#[derive(Default)] -pub(crate) struct LayerManager { - layer_map: LayerMap, - layer_fmgr: LayerFileManager, +pub(crate) enum LayerManager { + /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate + /// the layers. + Open(OpenLayerManager), + /// Shutdown layer manager where there are no more in-memory layers and persistent layers are + /// read-only. + Closed { + layers: HashMap, + }, +} + +impl Default for LayerManager { + fn default() -> Self { + LayerManager::Open(OpenLayerManager::default()) + } } impl LayerManager { - pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { - self.layer_fmgr.get_from_desc(desc) + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. + self.layers() + .get(key) + .with_context(|| format!("get layer from key: {key}")) + .expect("not found") + .clone() } - pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer { - self.layer_fmgr.get_from_key(desc) + pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { + self.get_from_key(&desc.key()) } /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. - pub(crate) fn layer_map(&self) -> &LayerMap { - &self.layer_map + pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map), + Closed { .. } => Err(Shutdown), + } } + pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> { + use LayerManager::*; + + match self { + Open(open) => Ok(open), + Closed { .. } => Err(Shutdown), + } + } + + /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in + /// order to allow shutdown to complete. + /// + /// If there was a want to flush in-memory layers, it must have happened earlier. 
+ pub(crate) fn shutdown(&mut self, writer_state: &mut Option) { + use LayerManager::*; + match self { + Open(OpenLayerManager { + layer_map, + layer_fmgr: LayerFileManager(hashmap), + }) => { + let open = layer_map.open_layer.take(); + let frozen = layer_map.frozen_layers.len(); + let taken_writer_state = writer_state.take(); + tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers"); + let layers = std::mem::take(hashmap); + *self = Closed { layers }; + assert_eq!(open.is_some(), taken_writer_state.is_some()); + } + Closed { .. } => { + tracing::debug!("ignoring multiple shutdowns on layer manager") + } + } + } + + /// Sum up the historic layer sizes + pub(crate) fn layer_size_sum(&self) -> u64 { + self.layers() + .values() + .map(|l| l.layer_desc().file_size) + .sum() + } + + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { + self.layers().values().filter(|l| l.is_likely_resident()) + } + + pub(crate) fn contains(&self, layer: &Layer) -> bool { + self.contains_key(&layer.layer_desc().key()) + } + + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layers().contains_key(key) + } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layers().keys().cloned().collect_vec() + } + + fn layers(&self) -> &HashMap { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0, + Closed { layers } => layers, + } + } +} + +#[derive(Default)] +pub(crate) struct OpenLayerManager { + layer_map: LayerMap, + layer_fmgr: LayerFileManager, +} + +impl std::fmt::Debug for OpenLayerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OpenLayerManager") + .field("layer_count", &self.layer_fmgr.0.len()) + .finish() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("layer manager has been shutdown")] +pub(crate) struct Shutdown; + +impl OpenLayerManager { /// Called from `load_layer_map`. Initialize the layer manager with: /// 1. all on-disk layers /// 2. next open layer (with disk disk_consistent_lsn LSN) - pub(crate) fn initialize_local_layers( - &mut self, - on_disk_layers: Vec, - next_open_layer_at: Lsn, - ) { + pub(crate) fn initialize_local_layers(&mut self, layers: Vec, next_open_layer_at: Lsn) { let mut updates = self.layer_map.batch_update(); - for layer in on_disk_layers { + for layer in layers { Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); } updates.flush(); @@ -68,26 +171,19 @@ impl LayerManager { self.layer_map.next_open_layer_at = Some(next_open_layer_at); } - /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, - /// called within `get_layer_for_write`. + /// Open a new writable layer to append data if there is no open layer, otherwise return the + /// current open layer, called within `get_layer_for_write`. pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, - last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, - ) -> Result> { + ) -> anyhow::Result> { ensure!(lsn.is_aligned()); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - // Do we have a layer open for writing already? 
let layer = if let Some(open_layer) = &self.layer_map.open_layer { if open_layer.get_lsn_range().start > lsn { @@ -113,8 +209,15 @@ impl LayerManager { lsn ); - let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; + let new_layer = InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + start_lsn, + gate_guard, + ctx, + ) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -168,7 +271,7 @@ impl LayerManager { froze } - /// Add image layers to the layer map, called from `create_image_layers`. + /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. pub(crate) fn track_new_image_layers( &mut self, image_layers: &[ResidentLayer], @@ -241,7 +344,7 @@ impl LayerManager { self.finish_compact_l0(compact_from, compact_to, metrics) } - /// Called when compaction is completed. + /// Called post-compaction when some previous generation image layers were trimmed. pub(crate) fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], @@ -330,31 +433,6 @@ impl LayerManager { mapping.remove(layer); layer.delete_on_drop(); } - - pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { - // for small layer maps, we most likely have all resident, but for larger more are likely - // to be evicted assuming lots of layers correlated with longer lifespan. - - self.layer_map().iter_historic_layers().filter_map(|desc| { - self.layer_fmgr - .0 - .get(&desc.key()) - .filter(|l| l.is_likely_resident()) - .cloned() - }) - } - - pub(crate) fn contains(&self, layer: &Layer) -> bool { - self.layer_fmgr.contains(layer) - } - - pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.layer_fmgr.contains_key(key) - } - - pub(crate) fn all_persistent_layers(&self) -> Vec { - self.layer_fmgr.0.keys().cloned().collect_vec() - } } pub(crate) struct LayerFileManager(HashMap); @@ -366,24 +444,6 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_key(&self, key: &PersistentLayerKey) -> T { - // The assumption for the `expect()` is that all code maintains the following invariant: - // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. - self.0 - .get(key) - .with_context(|| format!("get layer from key: {}", key)) - .expect("not found") - .clone() - } - - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { - self.get_from_key(&desc.key()) - } - - fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.0.contains_key(key) - } - pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { @@ -391,10 +451,6 @@ impl LayerFileManager { } } - pub(crate) fn contains(&self, layer: &T) -> bool { - self.0.contains_key(&layer.layer_desc().key()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { From 658d763915378c373bebdb86b40d905dceed2de8 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 7 Aug 2024 23:37:46 +0800 Subject: [PATCH 356/412] fix(pageserver): dump the key when it's invalid (#8633) We see an assertion error in staging. Dump the key to guess where it was from, and then we can fix it. 
Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0acd83753e..3af3f74e9c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -107,7 +107,10 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!( + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, + "invalid key: {self}", + ); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) From 40e3c913bb5588a2a3a9c8a76ab43fd7aa418140 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Aug 2024 19:19:00 +0300 Subject: [PATCH 357/412] refactor(timeline_detach_ancestor): replace ordered reparented with a hashset (#8629) Earlier I was thinking we'd need a (ancestor_lsn, timeline_id) ordered list of reparented. Turns out we did not need it at all. Replace it with an unordered hashset. Additionally refactor the reparented direct children query out, it will later be used from more places. Split off from #8430. Cc: #6994 --- .../src/models/detach_ancestor.rs | 4 +- pageserver/src/tenant/mgr.rs | 5 +- pageserver/src/tenant/timeline.rs | 6 +- .../src/tenant/timeline/detach_ancestor.rs | 121 ++++++++++-------- storage_controller/src/service.rs | 2 +- test_runner/fixtures/pageserver/http.py | 4 +- .../regress/test_timeline_detach_ancestor.py | 6 +- 7 files changed, 78 insertions(+), 70 deletions(-) diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index ae5a21bab9..ad74d343ae 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,8 @@ +use std::collections::HashSet; + use utils::id::TimelineId; #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { - pub reparented_timelines: Vec, + pub reparented_timelines: HashSet, } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7901fc3554..3f592f167e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -1966,7 +1966,8 @@ impl TenantManager { timeline_id: TimelineId, prepared: PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { + // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); impl Drop for RevertOnDropSlot { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f9ff78fd8..76dcb5645f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3286,10 +3286,6 @@ impl Timeline { Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> Option> { - self.ancestor_timeline.clone() - } - pub(crate) fn 
get_shard_identity(&self) -> &ShardIdentity { &self.shard_identity } @@ -4366,7 +4362,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 7f63b53e86..3b52adc77b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ @@ -146,50 +146,9 @@ pub(super) async fn prepare( } } - // detached has previously been detached; let's inspect each of the current timelines and - // report back the timelines which have been reparented by our detach - let mut all_direct_children = tenant - .timelines - .lock() - .unwrap() - .values() - .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) - .map(|tl| (tl.ancestor_lsn, tl.clone())) - .collect::>(); - - let mut any_shutdown = false; - - all_direct_children.retain( - |(_, tl)| match tl.remote_client.initialized_upload_queue() { - Ok(accessor) => accessor - .latest_uploaded_index_part() - .lineage - .is_reparented(), - Err(_shutdownalike) => { - // not 100% a shutdown, but let's bail early not to give inconsistent results in - // sharded enviroment. - any_shutdown = true; - true - } - }, - ); - - if any_shutdown { - // it could be one or many being deleted; have client retry - return Err(Error::ShuttingDown); - } - - let mut reparented = all_direct_children; - // why this instead of hashset? there is a reason, but I've forgotten it many times. - // - // maybe if this was a hashset we would not be able to distinguish some race condition. - reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); - + let reparented_timelines = reparented_direct_children(detached, tenant)?; return Ok(Progress::Done(AncestorDetached { - reparented_timelines: reparented - .into_iter() - .map(|(_, tl)| tl.timeline_id) - .collect(), + reparented_timelines, })); }; @@ -386,6 +345,57 @@ pub(super) async fn prepare( Ok(Progress::Prepared(guard, prepared)) } +fn reparented_direct_children( + detached: &Arc, + tenant: &Tenant, +) -> Result, Error> { + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)); + + if is_direct_child { + Some(tl.clone()) + } else { + if let Some(timeline) = tl.ancestor_timeline.as_ref() { + assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live"); + } + None + } + }) + // Collect to avoid lock taking order problem with Tenant::timelines and + // Timeline::remote_client + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. 
+ any_shutdown = true; + true + } + }); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + Ok(all_direct_children + .into_iter() + .map(|tl| tl.timeline_id) + .collect()) +} + fn partition_work( ancestor_lsn: Lsn, source: &LayerManager, @@ -544,11 +554,12 @@ pub(super) async fn complete( tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result, anyhow::Error> { let PreparedTimelineDetach { layers } = prepared; let ancestor = detached - .get_ancestor_timeline() + .ancestor_timeline + .as_ref() .expect("must still have a ancestor"); let ancestor_lsn = detached.get_ancestor_lsn(); @@ -588,7 +599,7 @@ pub(super) async fn complete( } let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; let is_deleting = tl @@ -629,13 +640,18 @@ pub(super) async fn complete( }); let reparenting_candidates = tasks.len(); - let mut reparented = Vec::with_capacity(tasks.len()); + let mut reparented = HashSet::with_capacity(tasks.len()); while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); + + assert!( + reparented.insert(timeline.timeline_id), + "duplicate reparenting? timeline_id={}", + timeline.timeline_id + ); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -657,12 +673,5 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } - reparented.sort_unstable(); - - let reparented = reparented - .into_iter() - .map(|(_, timeline_id)| timeline_id) - .collect(); - Ok(reparented) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6940bf2c64..e391ce65e6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2954,7 +2954,6 @@ impl Service { } // no shard needs to go first/last; the operation should be idempotent - // TODO: it would be great to ensure that all shards return the same error let mut results = self .tenant_for_shards(targets, |tenant_shard_id, node| { futures::FutureExt::boxed(detach_one( @@ -2973,6 +2972,7 @@ impl Service { .filter(|(_, res)| res != &any.1) .collect::>(); if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane let matching = results.len() - mismatching.len(); tracing::error!( matching, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 5be59d3749..65d6ff5d62 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -857,7 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, batch_size: int | None = None, **kwargs, - ) -> List[TimelineId]: + ) -> Set[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size @@ -868,7 +868,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) json = res.json() - return list(map(TimelineId, json["reparented_timelines"])) + return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git 
a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 38f8dfa885..b3767a2766 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -165,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == [] + assert all_reparented == set() if restart_after: env.pageserver.stop() @@ -534,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == [], "we have no earlier branches at any level" + assert reparented == set(), "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -774,7 +774,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): else: break - assert reparented == [], "too many retries (None) or unexpected reparentings" + assert reparented == set(), "too many retries (None) or unexpected reparentings" for shard_info in shards: node_id = int(shard_info["node_id"]) From e082226a32b91e53a3227ce66bf9b66d30654bd0 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 7 Aug 2024 19:53:47 +0300 Subject: [PATCH 358/412] cicd: push build-tools image to ACR as well (#8638) https://github.com/neondatabase/cloud/issues/15899 --- .github/workflows/pin-build-tools-image.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 024594532f..cf10910b0b 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -66,8 +66,22 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR + - name: Azure login + if: steps.check-manifests.outputs.skip == 'false' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: steps.check-manifests.outputs.skip == 'false' + run: | + az acr login --name=neoneastus2 + + - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR if: steps.check-manifests.outputs.skip == 'false' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} From 77bb6c4cc459268d6b95b4c543a6638b873a6af4 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:04:19 -0400 Subject: [PATCH 359/412] feat(pageserver): add direct io pageserver config (#8622) Part of #8130, [RFC: Direct IO For Pageserver](https://github.com/neondatabase/neon/blob/problame/direct-io-rfc/docs/rfcs/034-direct-io-for-pageserver.md) ## Description Add pageserver config for evaluating/enabling direct I/O. - Disabled: current default, uses buffered io as is. 
- Evaluate: still uses buffered io, but could do alignment checking and perf simulation (pad latency by direct io RW to a fake file). - Enabled: uses direct io, behavior on alignment error is configurable. Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/models.rs | 49 +++++++++++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 17 +++++++++++ pageserver/src/virtual_file.rs | 1 + 4 files changed, 68 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b541bba6a1..ab4adfbebe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -947,6 +947,8 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { + use std::path::PathBuf; + #[derive( Copy, Clone, @@ -965,6 +967,53 @@ pub mod virtual_file { #[cfg(target_os = "linux")] TokioEpollUring, } + + /// Direct IO modes for a pageserver. + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] + pub enum DirectIoMode { + /// Direct IO disabled (uses usual buffered IO). + #[default] + Disabled, + /// Direct IO disabled (performs checks and perf simulations). + Evaluate { + /// Alignment check level + alignment_check: DirectIoAlignmentCheckLevel, + /// Latency padded for performance simulation. + latency_padding: DirectIoLatencyPadding, + }, + /// Direct IO enabled. + Enabled { + /// Actions to perform on alignment error. + on_alignment_error: DirectIoOnAlignmentErrorAction, + }, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoAlignmentCheckLevel { + #[default] + Error, + Log, + None, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoOnAlignmentErrorAction { + Error, + #[default] + FallbackToBuffered, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "type", rename_all = "kebab-case")] + pub enum DirectIoLatencyPadding { + /// Pad virtual file operations with IO to a fake file. + FakeFileRW { path: PathBuf }, + #[default] + None, + } } // Wrapped in libpq CopyData diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5ebd6511ac..932918410c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -123,6 +123,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 41c2fe0af3..f4c367bd4d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -300,6 +300,9 @@ pub struct PageServerConf { /// This flag is temporary and will be removed after gradual rollout. /// See . 
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + + /// Direct IO settings + pub virtual_file_direct_io: virtual_file::DirectIoMode, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -408,6 +411,8 @@ struct PageServerConfigBuilder { l0_flush: BuilderValue, compact_level0_phase1_value_access: BuilderValue, + + virtual_file_direct_io: BuilderValue, } impl PageServerConfigBuilder { @@ -498,6 +503,7 @@ impl PageServerConfigBuilder { ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), + virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), } } } @@ -685,6 +691,10 @@ impl PageServerConfigBuilder { self.compact_level0_phase1_value_access = BuilderValue::Set(value); } + pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { + self.virtual_file_direct_io = BuilderValue::Set(value); + } + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); @@ -743,6 +753,7 @@ impl PageServerConfigBuilder { ephemeral_bytes_per_memory_kb, l0_flush, compact_level0_phase1_value_access, + virtual_file_direct_io, } CUSTOM LOGIC { @@ -1018,6 +1029,9 @@ impl PageServerConf { "compact_level0_phase1_value_access" => { builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) } + "virtual_file_direct_io" => { + builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1103,6 +1117,7 @@ impl PageServerConf { ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), } } } @@ -1345,6 +1360,7 @@ background_task_maximum_delay = '334 s' ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1420,6 +1436,7 @@ background_task_maximum_delay = '334 s' ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 51b0c420c3..8de646469e 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -34,6 +34,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use api::DirectIoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; From acaacd468097cb83cdcbd7f74284ada964d08bc0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 7 Aug 2024 21:17:08 +0100 Subject: [PATCH 360/412] pageserver: make bench_ingest build (but panic) on macOS (#8641) ## Problem Some developers build 
on MacOS, which doesn't have io_uring. ## Summary of changes - Add `io_engine_for_bench`, which on linux will give io_uring or panic if it's unavailable, and on MacOS will always panic. We do not want to run such benchmarks with StdFs: the results aren't interesting, and will actively waste the time of any developers who start investigating performance before they realize they're using a known-slow I/O backend. Why not just conditionally compile this benchmark on linux only? Because even on linux, I still want it to refuse to run if it can't get io_uring. --- pageserver/benches/bench_ingest.rs | 4 ++-- pageserver/src/virtual_file.rs | 1 + pageserver/src/virtual_file/io_engine.rs | 26 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 459394449a..9bab02e46c 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -11,7 +11,7 @@ use pageserver::{ repository::Value, task_mgr::TaskKind, tenant::storage_layer::InMemoryLayer, - virtual_file::{self, api::IoEngineKind}, + virtual_file, }; use pageserver_api::{key::Key, shard::TenantShardId}; use utils::{ @@ -149,7 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init(16384, IoEngineKind::TokioEpollUring); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8de646469e..27f6fe90a4 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -30,6 +30,7 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::io_engine_for_bench; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 2820cea097..0ffcd9fa05 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result { .join() .unwrap() } + +/// For use in benchmark binaries only. +/// +/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also +/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste +/// developer time trying to figure out why it's slow. +/// +/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic. 
+pub fn io_engine_for_bench() -> IoEngineKind { + #[cfg(not(target_os = "linux"))] + { + panic!("This benchmark does I/O and can only give a representative result on Linux"); + } + #[cfg(target_os = "linux")] + { + match feature_test().unwrap() { + FeatureTestResult::PlatformPreferred(engine) => engine, + FeatureTestResult::Worse { + engine: _engine, + remark, + } => { + panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}"); + } + } + } +} From 2a210d4c58e57e1d22a28be4433907adfc944698 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 8 Aug 2024 10:23:57 +0300 Subject: [PATCH 361/412] Use sycnhronous commit for logical replicaiton worker (#8645) ## Problem See https://neondb.slack.com/archives/C03QLRH7PPD/p1723038557449239?thread_ts=1722868375.476789&cid=C03QLRH7PPD Logical replication subscription by default use `synchronous_commit=off` which cause problems with safekeeper ## Summary of changes Set `synchronous_commit=on` for logical replication subscription in test_subscriber_restart.py ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- test_runner/regress/test_subscriber_restart.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 91caad7220..4581008022 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -37,7 +37,9 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" - query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + # synchronous_commit=on to test a hypothesis for why this test has been flaky. + # XXX: Add link to the issue + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)" scur.execute(query) time.sleep(2) # let initial table sync complete From 48ae1214c5385f6feb645e3f10598fdfc9a33191 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 8 Aug 2024 12:34:47 +0300 Subject: [PATCH 362/412] fix(test): do not fail test for filesystem race (#8643) evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8632/10287641784/index.html#suites/0e58fb04d9998963e98e45fe1880af7d/c7a46335515142b/ --- pageserver/src/statvfs.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 45a516566f..ede1791afa 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -56,7 +56,6 @@ impl Statvfs { } pub mod mock { - use anyhow::Context; use camino::Utf8Path; use regex::Regex; use tracing::log::info; @@ -135,14 +134,30 @@ pub mod mock { { continue; } - total += entry - .metadata() - .with_context(|| format!("get metadata of {:?}", entry.path()))? 
- .len(); + let m = match entry.metadata() { + Ok(m) => m, + Err(e) if is_not_found(&e) => { + // some temp file which got removed right as we are walking + continue; + } + Err(e) => { + return Err(anyhow::Error::new(e) + .context(format!("get metadata of {:?}", entry.path()))) + } + }; + total += m.len(); } Ok(total) } + fn is_not_found(e: &walkdir::Error) -> bool { + let Some(io_error) = e.io_error() else { + return false; + }; + let kind = io_error.kind(); + matches!(kind, std::io::ErrorKind::NotFound) + } + pub struct Statvfs { pub blocks: u64, pub blocks_available: u64, From 552832b81972d8facc609f77944c16dba2d092d5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 8 Aug 2024 14:02:53 +0300 Subject: [PATCH 363/412] fix: stop leaking BackgroundPurges (#8650) avoid "leaking" the completions of BackgroundPurges by: 1. switching it to TaskTracker for provided close+wait 2. stop using tokio::fs::remove_dir_all which will consume two units of memory instead of one blocking task Additionally, use more graceful shutdown in tests which do actually some background cleanup. --- pageserver/src/tenant/mgr.rs | 96 +++++++---------------- test_runner/regress/test_tenant_delete.py | 11 ++- 2 files changed, 36 insertions(+), 71 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3f592f167e..3316627540 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); -enum BackgroundPurgesInner { - Open(tokio::task::JoinSet<()>), - // we use the async mutex for coalescing - ShuttingDown(Arc>>), -} - -impl Default for BackgroundPurges { - fn default() -> Self { - Self(Arc::new(std::sync::Mutex::new( - BackgroundPurgesInner::Open(JoinSet::new()), - ))) - } -} +#[derive(Clone, Default)] +pub struct BackgroundPurges(tokio_util::task::TaskTracker); impl BackgroundPurges { /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in @@ -247,24 +234,32 @@ impl BackgroundPurges { /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. /// Thus the [`BackgroundPurges`] type to keep track of these tasks. pub fn spawn(&self, tmp_path: Utf8PathBuf) { - let mut guard = self.0.lock().unwrap(); - let jset = match &mut *guard { - BackgroundPurgesInner::Open(ref mut jset) => jset, - BackgroundPurgesInner::ShuttingDown(_) => { - warn!("trying to spawn background purge during shutdown, ignoring"); - return; + // because on shutdown we close and wait, we are misusing TaskTracker a bit. + // + // so first acquire a token, then check if the tracker has been closed. the tracker might get closed + // right after, but at least the shutdown will wait for what we are spawning next. + let token = self.0.token(); + + if self.0.is_closed() { + warn!( + %tmp_path, + "trying to spawn background purge during shutdown, ignoring" + ); + return; + } + + let span = info_span!(parent: None, "background_purge", %tmp_path); + + let task = move || { + let _token = token; + let _entered = span.entered(); + if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) { + // should we fatal_io_error here? + warn!(%error, "failed to purge tenant directory"); } }; - jset.spawn_on( - async move { - if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { - // should we fatal_io_error here? 
- warn!(%error, path=%tmp_path, "failed to purge tenant directory"); - } - } - .instrument(info_span!(parent: None, "background_purge")), - BACKGROUND_RUNTIME.handle(), - ); + + BACKGROUND_RUNTIME.spawn_blocking(task); } /// When this future completes, all background purges have completed. @@ -278,42 +273,9 @@ impl BackgroundPurges { /// instances of this future will continue to be correct. #[instrument(skip_all)] pub async fn shutdown(&self) { - let jset = { - let mut guard = self.0.lock().unwrap(); - match &mut *guard { - BackgroundPurgesInner::Open(jset) => { - *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( - std::mem::take(jset), - ))) - } - BackgroundPurgesInner::ShuttingDown(_) => { - // calling shutdown multiple times is most likely a bug in pageserver shutdown code - warn!("already shutting down"); - } - }; - match &mut *guard { - BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), - BackgroundPurgesInner::Open(_) => { - unreachable!("above code transitions into shut down state"); - } - } - }; - let mut jset = jset.lock().await; // concurrent callers coalesce here - while let Some(res) = jset.join_next().await { - match res { - Ok(()) => {} - Err(e) if e.is_panic() => { - // If it panicked, the error is already logged by the panic hook. - } - Err(e) if e.is_cancelled() => { - unreachable!("we don't cancel the joinset or runtime") - } - Err(e) => { - // No idea when this can happen, but let's log it. - warn!(%e, "background purge task failed or panicked"); - } - } - } + // forbid new tasks (can be called many times) + self.0.close(); + self.0.wait().await; } } diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c01b3a2e89..dadf5ca672 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -128,6 +128,8 @@ def test_tenant_delete_smoke( assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 + env.pageserver.stop() + def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): """Reproduction of 2023-11-23 stuck tenants investigation""" @@ -200,11 +202,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE if deletion is not None: deletion.join() + env.pageserver.stop() -def test_tenant_delete_races_timeline_creation( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): + +def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder): """ Validate that timeline creation executed in parallel with deletion works correctly. @@ -318,6 +319,8 @@ def test_tenant_delete_races_timeline_creation( # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() + env.pageserver.stop() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ From 56077caaf92a4a776eb51a169dcd3f113276f84c Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 8 Aug 2024 12:57:48 +0100 Subject: [PATCH 364/412] pageserver: remove paranoia double-calculation of retain_lsns (#8617) ## Problem This code was to mitigate risk in https://github.com/neondatabase/neon/pull/8427 As expected, we did not hit this code path - the new continuous updates of gc_info are working fine, we can remove this code now. 
## Summary of changes - Remove block that double-checks retain_lsns --- pageserver/src/tenant.rs | 48 ---------------------------------------- 1 file changed, 48 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2422ab4cf2..90c0e28bc4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3012,54 +3012,6 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they - // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here - // and fail out if it's inaccurate. - // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) - { - let mut all_branchpoints: BTreeMap> = - BTreeMap::new(); - timelines.iter().for_each(|timeline| { - if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { - let ancestor_children = - all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); - } - }); - - for timeline in &timelines { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints - .remove(&timeline.timeline_id) - .unwrap_or_default(); - - branchpoints.sort_by_key(|b| b.0); - - let target = timeline.gc_info.read().unwrap(); - - // We require that retain_lsns contains everything in `branchpoints`, but not that - // they are exactly equal: timeline deletions can race with us, so retain_lsns - // may contain some extra stuff. It is safe to have extra timelines in there, because it - // just means that we retain slightly more data than we otherwise might. - let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); - for b in &branchpoints { - if !have_branchpoints.contains(b) { - tracing::error!( - "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", - branchpoints, - target.retain_lsns - ); - debug_assert!(false); - // Do not GC based on bad information! - // (ab-use an existing GcError type rather than adding a new one, since this is a - // "should never happen" check that will be removed soon). - return Err(GcError::Remote(anyhow::anyhow!( - "retain_lsns failed validation!" - ))); - } - } - } - } - // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timelines.len()); From 857ad70b71ffaebcfdea27cae5d4298b29dd6b3f Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 8 Aug 2024 19:24:21 +0100 Subject: [PATCH 365/412] tests: don't require kafka client for regular tests (#8662) ## Problem We're adding more third party dependencies to support more diverse + realistic test cases in `test_runner/logical_repl`. I :heart: these tests, they are a good thing. The slight glitch is that python packaging is hard, and some third party python packages have issues. For example the current kafka dependency doesn't work on latest python. We can mitigate that by only importing these more specialized dependencies in the tests that use them. ## Summary of changes - Move the `kafka` import into a test body, so that folks running the regular `test_runner/regress` tests don't have to have a working kafka client package. 
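As an illustration of the pattern (a hedged sketch only — the test name below is made up, while the `KafkaConsumer` call mirrors the existing Debezium test), the import simply moves from module level into the test body:

```python
def test_needs_kafka_client():
    # Import inside the test body: collecting or running the regular
    # `test_runner/regress` suite never evaluates this line, so those runs
    # do not require a working kafka client package.
    from kafka import KafkaConsumer

    consumer = KafkaConsumer(
        "dbserver1.inventory.customers",
        bootstrap_servers=["kafka:9092"],
    )
    assert consumer is not None
```

This keeps packaging breakage of niche client libraries scoped to the suites that actually exercise them.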
--- test_runner/logical_repl/test_debezium.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index 700b731418..5426a06ca1 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -12,7 +12,6 @@ import requests from fixtures.log_helper import log from fixtures.neon_fixtures import RemotePostgres from fixtures.utils import wait_until -from kafka import KafkaConsumer class DebeziumAPI: @@ -95,6 +94,8 @@ def debezium(remote_pg: RemotePostgres): log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) assert resp.status_code == 201 assert len(dbz.list_connectors()) == 1 + from kafka import KafkaConsumer + consumer = KafkaConsumer( "dbserver1.inventory.customers", bootstrap_servers=["kafka:9092"], From c725a3e4b146b6db6a546091a10c0571e70a5b0d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 07:17:16 +0100 Subject: [PATCH 366/412] CI(build-tools): update Rust, Python, Mold (#8667) ## Problem - Rust 1.80.1 has been released: https://blog.rust-lang.org/2024/08/08/Rust-1.80.1.html - Python 3.9.19 has been released: https://www.python.org/downloads/release/python-3919/ - Mold 2.33.0 has been released: https://github.com/rui314/mold/releases/tag/v2.33.0 - Unpinned `cargo-deny` in `build-tools` got updated to the latest version and doesn't work anymore with the current config file ## Summary of changes - Bump Rust to 1.80.1 - Bump Python to 3.9.19 - Bump Mold to 2.33.0 - Pin `cargo-deny`, `cargo-hack`, `cargo-hakari`, `cargo-nextest`, `rustfilt` versions - Update `deny.toml` to the latest format, see https://github.com/EmbarkStudios/cargo-deny/pull/611 --- Dockerfile.build-tools | 21 +++++++++++++-------- deny.toml | 10 ++-------- rust-toolchain.toml | 2 +- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index dfaab1cb2e..a72092e8e2 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.31.0 +ENV MOLD_VERSION=v2.33.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -168,7 +168,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.18 \ +ENV PYTHON_VERSION=3.9.19 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -192,9 +192,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.0 +ENV RUSTC_VERSION=1.80.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG RUSTFILT_VERSION=0.2.1 +ARG CARGO_HAKARI_VERSION=0.9.30 +ARG CARGO_DENY_VERSION=0.16.1 +ARG CARGO_HACK_VERSION=0.6.31 +ARG CARGO_NEXTEST_VERSION=0.9.72 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -204,11 +209,11 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ cargo install --git https://github.com/paritytech/cachepot && \ - cargo install 
rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny --locked && \ - cargo install cargo-hack && \ - cargo install cargo-nextest && \ + cargo install rustfilt --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git ENV RUSTC_WRAPPER=cachepot diff --git a/deny.toml b/deny.toml index 469609c496..dc985138e6 100644 --- a/deny.toml +++ b/deny.toml @@ -4,6 +4,7 @@ # to your expectations and requirements. # Root options +[graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, @@ -12,6 +13,7 @@ targets = [ ] all-features = false no-default-features = false +[output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` @@ -19,17 +21,13 @@ feature-depth = 1 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] -vulnerability = "deny" -unmaintained = "warn" yanked = "warn" -notice = "warn" ignore = [] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -unlicensed = "deny" allow = [ "Apache-2.0", "Artistic-2.0", @@ -42,10 +40,6 @@ allow = [ "OpenSSL", "Unicode-DFS-2016", ] -deny = [] -copyleft = "warn" -allow-osi-fsf-free = "neither" -default = "deny" confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change sth diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3510359591..368b8d300a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.80.0" +channel = "1.80.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html From af8c86590322b2bd906ef80ec83b24da931203ab Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 07:54:54 +0100 Subject: [PATCH 367/412] Dockerfiles: fix LegacyKeyValueFormat & JSONArgsRecommended (#8664) ## Problem CI complains in all PRs: ``` "ENV key=value" should be used instead of legacy "ENV key value" format ``` https://docs.docker.com/reference/build-checks/legacy-key-value-format/ See - https://github.com/neondatabase/neon/pull/8644/files ("Unchanged files with check annotations" section) - https://github.com/neondatabase/neon/actions/runs/10304090562?pr=8644 ("Annotations" section) ## Summary of changes - Use `ENV key=value` instead of `ENV key value` in all Dockerfiles --- Dockerfile | 6 +++--- Dockerfile.build-tools | 2 +- Dockerfile.compute-node | 28 ++++++++++++++-------------- vm-image-spec.yaml | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index ace112cccf..6ed57a84a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -ENV BUILD_TYPE release +ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/build \ @@ -104,7 +104,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH /usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v16/lib VOLUME ["/data"] @@ -112,5 +112,5 @@ USER neon EXPOSE 6400 EXPOSE 9898 -CMD /usr/local/bin/pageserver -D /data/.neon +CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index a72092e8e2..d39d36e1b6 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,7 +58,7 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 25.1 +ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 054d44e0ec..7acaf2f2fd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV 
PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ @@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -506,7 +506,7 @@ RUN apt-get update && \ libboost-system1.74-dev \ libeigen3-dev -ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ @@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. FROM build-deps AS pg-uuidv7-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz FROM build-deps AS pg-roaringbitmap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ @@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 FROM build-deps AS pg-semver-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ @@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ @@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - FROM build-deps AS wal2json-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ @@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. FROM build-deps AS pg-ivm-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv FROM build-deps AS pg-partman-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -1034,6 +1034,6 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 7d005c7139..41d6e11725 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -416,7 +416,7 @@ build: | # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. 
FROM debian:bullseye-slim as libcgroup-builder - ENV LIBCGROUP_VERSION v2.0.3 + ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ @@ -460,7 +460,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_1 + ENV PGBOUNCER_TAG=pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ From a8eebdb07210ccf235122e2a9a3ea228e4d887f9 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 9 Aug 2024 09:36:29 +0200 Subject: [PATCH 368/412] Run a subset of benchmarking job steps on GitHub action runners in Azure - closer to the system under test (#8651) ## Problem Latency from one cloud provider to another one is higher than within the same cloud provider. Some of our benchmarks are latency sensitive - we run a pgbench or psql in the github action runner and the system under test is running in Neon (database project). For realistic perf tps and latency results we need to compare apples to apples and run the database client in the same "latency distance" for all tests. ## Summary of changes Move job steps that test Neon databases deployed on Azure into Azure action runners. - bench strategy variant using azure database - pgvector strategy variant using azure database - pgbench-compare strategy variants using azure database ## Test run https://github.com/neondatabase/neon/actions/runs/10314848502 --- .github/actionlint.yml | 1 + .github/workflows/benchmarking.yml | 88 +++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 20 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index d27fa01efa..a5282876d0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -13,3 +13,4 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID + - DEV_AWS_OIDC_ROLE_ARN diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0f4dac841e..6f80d6e431 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,6 +56,10 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: @@ -63,9 +67,13 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -76,14 +84,21 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: 
eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -161,6 +176,7 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -237,6 +253,9 @@ jobs: id: pgbench-compare-matrix run: | region_id_default=${{ env.DEFAULT_REGION_ID }} + runner_default='["self-hosted", "us-east-2", "x64"]' + runner_azure='["self-hosted", "eastus2", "x64"]' + image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" matrix='{ "pg_version" : [ 16 @@ -250,16 +269,19 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "runner": ['"$runner_default"'], + "image": [ "'"$image_default"'" ], + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -302,6 +324,10 @@ jobs: pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] + permissions: + contents: write + statuses: write + id-token: write 
# Required for OIDC authentication in azure runners strategy: fail-fast: false @@ -317,9 +343,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.runner }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.image }} options: --init # Increase timeout to 8h, default timeout is 6h @@ -328,6 +354,13 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -435,12 +468,20 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - PLATFORM: "azure-captest-pgvector" + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" @@ -453,9 +494,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: @@ -466,12 +507,12 @@ jobs: - name: Install postgresql-16 where pytest expects it run: | cd /home/nonroot - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb - dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql @@ -496,6 +537,13 @@ jobs: esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Benchmark pgvector hnsw 
indexing uses: ./.github/actions/run-python-test-set @@ -524,7 +572,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate From 9dfed93f70dd86fe77b84eb1efa88d9676b51fba Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 Aug 2024 09:09:29 +0100 Subject: [PATCH 369/412] Revert "proxy: update tokio-postgres to allow arbitrary config params (#8076)" (#8654) This reverts #8076 - which was already reverted from the release branch since forever (it would have been a breaking change to release for all users who currently set TimeZone options). It's causing conflicts now so we should revert it here as well. --- Cargo.lock | 8 +- libs/postgres_connection/src/lib.rs | 50 +++++----- proxy/src/compute.rs | 129 ++++++++++++-------------- proxy/src/serverless/backend.rs | 4 - proxy/src/serverless/sql_over_http.rs | 1 - test_runner/regress/test_proxy.py | 19 ---- 6 files changed, 92 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f565119dbd..031fae0f37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3960,7 +3960,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3973,7 +3973,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3992,7 +3992,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -6187,7 +6187,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index fdabcbacb2..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -144,7 +144,20 @@ impl PgConnectionConfig { // implement and this function is hardly a bottleneck. The function is only called around // establishing a new connection. 
#[allow(unstable_name_collisions)] - config.options(&encode_options(&self.options)); + config.options( + &self + .options + .iter() + .map(|s| { + if s.contains(['\\', ' ']) { + Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) + } else { + Cow::Borrowed(s.as_str()) + } + }) + .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized + .collect::(), + ); } config } @@ -165,21 +178,6 @@ impl PgConnectionConfig { } } -#[allow(unstable_name_collisions)] -fn encode_options(options: &[String]) -> String { - options - .iter() - .map(|s| { - if s.contains(['\\', ' ']) { - Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) - } else { - Cow::Borrowed(s.as_str()) - } - }) - .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized - .collect::() -} - impl fmt::Display for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // The password is intentionally hidden and not part of this display string. @@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::{encode_options, PgConnectionConfig}; + use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; @@ -257,12 +255,18 @@ mod tests_pg_connection_config { #[test] fn test_with_options() { - let options = encode_options(&[ - "hello".to_owned(), - "world".to_owned(), - "with space".to_owned(), - "and \\ backslashes".to_owned(), + let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ + "hello", + "world", + "with space", + "and \\ backslashes", ]); - assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes"); + assert_eq!(cfg.host(), &*STUB_HOST); + assert_eq!(cfg.port(), 123); + assert_eq!(cfg.raw_address(), "stub.host.example:123"); + assert_eq!( + cfg.to_tokio_postgres_config().get_options(), + Some("hello world with\\ space and\\ \\\\\\ backslashes") + ); } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 21687160ea..18c82fe379 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -103,8 +103,12 @@ impl ConnCfg { /// Reuse password or auth keys from the other config. pub fn reuse_password(&mut self, other: Self) { - if let Some(password) = other.get_auth() { - self.auth(password); + if let Some(password) = other.get_password() { + self.password(password); + } + + if let Some(keys) = other.get_auth_keys() { + self.auth_keys(keys); } } @@ -120,64 +124,48 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { - let mut client_encoding = false; - for (k, v) in params.iter() { - match k { - "user" => { - // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. - if self.get_user().is_none() { - self.user(v); - } + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if let (None, Some(user)) = (self.get_user(), params.get("user")) { + self.user(user); + } + + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { + self.dbname(dbname); + } + + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. 
+ if let Some(options) = filtered_options(params) { + self.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.application_name(app_name); + } + + // TODO: This is especially ugly... + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.replication_mode(ReplicationMode::Physical); } "database" => { - // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. - if self.get_dbname().is_none() { - self.dbname(v); - } - } - "options" => { - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(v) { - self.options(&options); - } - } - - // the special ones in tokio-postgres that we don't want being set by the user - "dbname" => {} - "password" => {} - "sslmode" => {} - "host" => {} - "port" => {} - "connect_timeout" => {} - "keepalives" => {} - "keepalives_idle" => {} - "keepalives_interval" => {} - "keepalives_retries" => {} - "target_session_attrs" => {} - "channel_binding" => {} - "max_backend_message_size" => {} - - "client_encoding" => { - client_encoding = true; - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param("client_encoding", v); - } - - _ => { - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param(k, v); + self.replication_mode(ReplicationMode::Logical); } + _other => {} } } - if !client_encoding { - // for compatibility since we removed it from tokio-postgres - self.param("client_encoding", "UTF8").unwrap(); - } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). } } @@ -350,9 +338,10 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(options: &str) -> Option { +fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] - let options: String = StartupMessageParams::parse_options_raw(options) + let options: String = params + .options_raw()? .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -424,23 +413,27 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - assert_eq!(filtered_options(""), None); + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(¶ms), None); // It's likely that clients will only use options to specify endpoint/project. - let params = "project=foo"; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(¶ms), None); // Same, because unescaped whitespaces are no-op. 
- let params = " project=foo "; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(¶ms).as_deref(), None); - let params = r"\ project=foo \ "; - assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); - let params = "project = foo"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); - let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([( + "options", + "project = foo neon_endpoint_type:read_write neon_lsn:0/2", + )]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 80d46c67eb..295ea1a1c7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -236,10 +236,6 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - config - .param("client_encoding", "UTF8") - .expect("client encoding UTF8 is always valid"); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 77ec6b1c73..e5b6536328 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -203,7 +203,6 @@ fn get_conn_info( options = Some(NeonOptions::parse_options_raw(&value)); } } - ctx.set_db_options(params.freeze()); let user_info = ComputeUserInfo { endpoint, diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 8ed44b1094..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -53,25 +53,6 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 42 -def test_proxy_server_params(static_proxy: NeonProxy): - """ - Test that server params are passing through to postgres - """ - - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601" - ) - assert out[0][0] == "PT0S" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard" - ) - assert out[0][0] == "0" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres" - ) - assert out[0][0] == "00:00:00" - - def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for From f8c0da43b5b2ed68d0a7d3cce66cb09ab93575f5 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Fri, 9 Aug 2024 16:18:55 +0800 Subject: [PATCH 370/412] fix(neon): disable create tablespace stmt (#8657) part of https://github.com/neondatabase/neon/issues/8653 Disable create tablespace stmt. It turns out it requires much less effort to do the regress test mode flag than patching the test cases, and given that we might need to support tablespaces in the future, I decided to add a new flag `regress_test_mode` to change the behavior of create tablespace. 
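For context, a minimal sketch of how the new behavior could be checked from the Python test suite (the test name, tablespace path, and exception matching below are illustrative assumptions; the GUC name and error text come from this change):

```python
import pytest
from fixtures.neon_fixtures import NeonEnv


def test_create_tablespace_is_rejected(neon_simple_env: NeonEnv):
    env = neon_simple_env
    # Default compute: neon.regress_test_mode is off.
    endpoint = env.endpoints.create_start("main")

    # The neon extension's ProcessUtility hook refuses the statement.
    with pytest.raises(Exception, match="CREATE TABLESPACE is not supported on Neon"):
        endpoint.safe_psql("CREATE TABLESPACE ts LOCATION '/tmp/ts'")
```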
Tested manually that without setting regress_test_mode, create tablespace will be rejected. --------- Signed-off-by: Alex Chi Z Co-authored-by: Heikki Linnakangas --- pgxn/neon/control_plane_connector.c | 21 +++++++++++++++++++++ test_runner/regress/test_pg_regress.py | 25 ++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 93252e6b29..de023da5c4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -45,6 +45,7 @@ static const char *jwt_token = NULL; /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; +static bool RegressTestMode = false; /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup @@ -802,6 +803,14 @@ NeonProcessUtility( case T_DropRoleStmt: HandleDropRole(castNode(DropRoleStmt, parseTree)); break; + case T_CreateTableSpaceStmt: + if (!RegressTestMode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CREATE TABLESPACE is not supported on Neon"))); + } + break; default: break; } @@ -864,6 +873,18 @@ InitControlPlaneConnector() NULL, NULL); + DefineCustomBoolVariable( + "neon.regress_test_mode", + "Controls whether we are running in the regression test mode", + NULL, + &RegressTestMode, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6f7ea0092a..45ce5b1c5b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -144,7 +144,13 @@ def test_pg_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. @@ -207,7 +213,14 @@ def test_isolation( # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_prepared_transactions=100", + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. @@ -268,7 +281,13 @@ def test_sql_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. From cbba8e339036e98f632407f194c0e947d133daa9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 12:05:43 +0100 Subject: [PATCH 371/412] CI(pin-build-tools-image): fix permissions for Azure login (#8671) ## Problem Azure login fails in `pin-build-tools-image` workflow because the job doesn't have the required permissions. 
``` Error: Please make sure to give write permissions to id-token in the workflow. Error: Login failed with Error: Error message: Unable to get ACTIONS_ID_TOKEN_REQUEST_URL env variable. Double check if the 'auth-type' is correct. Refer to https://github.com/Azure/login#readme for more information. ``` ## Summary of changes - Add `id-token: write` permission to `pin-build-tools-image` - Add an input to force image tagging - Unify pushing to Docker Hub with other registries - Split the job into two to have less if's --- .github/workflows/pin-build-tools-image.yml | 50 +++++++++++++-------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index cf10910b0b..2e79498fc4 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -7,12 +7,20 @@ on: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean workflow_call: inputs: from-tag: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean defaults: run: @@ -22,15 +30,18 @@ concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} cancel-in-progress: false +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -jobs: - tag-image: - runs-on: ubuntu-22.04 +env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned - env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned +jobs: + check-manifests: + runs-on: ubuntu-22.04 + outputs: + skip: ${{ steps.check-manifests.outputs.skip }} steps: - name: Check if we really need to pin the image @@ -47,27 +58,31 @@ jobs: echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + tag-image: + needs: check-manifests + + # use format(..) 
to catch both inputs.force = true AND inputs.force = 'true' + if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' + + runs-on: ubuntu-22.04 + + permissions: + id-token: write # for `azure/login` + + steps: - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' + with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub - if: steps.check-manifests.outputs.skip == 'false' - run: | - docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} - - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' with: registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Azure login - if: steps.check-manifests.outputs.skip == 'false' uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 with: client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} @@ -75,13 +90,12 @@ jobs: subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - name: Login to ACR - if: steps.check-manifests.outputs.skip == 'false' run: | az acr login --name=neoneastus2 - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR - if: steps.check-manifests.outputs.skip == 'false' + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ + -t neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} From e9a378d1aa72c245044d63f3b53193f424f507d4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Aug 2024 14:01:56 +0100 Subject: [PATCH 372/412] pageserver: don't treat NotInitialized::Stopped as unexpected (#8675) ## Problem This type of error can happen during shutdown & was triggering a circuit breaker alert. ## Summary of changes - Map NotIntialized::Stopped to CompactionError::ShuttingDown, so that we may handle it cleanly --- pageserver/src/tenant/timeline.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 76dcb5645f..f810df5a56 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4412,11 +4412,11 @@ impl From for CompactionError { impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { - super::upload_queue::NotInitialized::Uninitialized - | super::upload_queue::NotInitialized::Stopped => { + super::upload_queue::NotInitialized::Uninitialized => { CompactionError::Other(anyhow::anyhow!(value)) } - super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + super::upload_queue::NotInitialized::ShuttingDown + | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, } } } From 494023f5dffc6d74affa4fc4a4342c6489ff7178 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 9 Aug 2024 15:45:07 +0100 Subject: [PATCH 373/412] storcon: skip draining shard if it's secondary is lagging too much (#8644) ## Problem Migrations of tenant shards with cold secondaries are holding up drains in during production deployments. 
## Summary of changes If a secondary locations is lagging by more than 256MiB (configurable, but that's the default), then skip cutting it over to the secondary as part of the node drain. --- control_plane/src/local_env.rs | 3 + control_plane/src/storage_controller.rs | 4 + pageserver/src/tenant/secondary/downloader.rs | 6 +- storage_controller/src/drain_utils.rs | 225 ++++++++++++++++++ storage_controller/src/lib.rs | 1 + storage_controller/src/main.rs | 6 + storage_controller/src/reconciler.rs | 74 +++++- storage_controller/src/service.rs | 222 ++++++++++++----- storage_controller/src/tenant_shard.rs | 4 +- test_runner/fixtures/neon_fixtures.py | 64 +++++ test_runner/fixtures/pageserver/http.py | 6 + .../test_storage_controller_scale.py | 48 +--- .../regress/test_storage_controller.py | 113 +++++++++ 13 files changed, 666 insertions(+), 110 deletions(-) create mode 100644 storage_controller/src/drain_utils.rs diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 505d157efd..15bbac702f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf { /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, + + pub max_secondary_lag_bytes: Option, } impl NeonStorageControllerConf { @@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf { max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, + max_secondary_lag_bytes: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e054e9ee57..f180e922e8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -383,6 +383,10 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { + args.push(format!("--max-secondary-lag-bytes={lag}")) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 27439d4f03..135e73b57f 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, serde_system_time, + id::TimelineId, pausable_failpoint, serde_system_time, }; use super::{ @@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> { layer: HeatMapLayer, ctx: &RequestContext, ) -> Result, UpdateError> { - // Failpoint for simulating slow remote storage + // Failpoints for simulating slow remote storage failpoint_support::sleep_millis_async!( "secondary-layer-download-sleep", &self.secondary_state.cancel ); + pausable_failpoint!("secondary-layer-download-pausable"); + let local_path = local_layer_path( self.conf, tenant_shard_id, diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs new file mode 100644 index 0000000000..dea1f04649 --- /dev/null +++ b/storage_controller/src/drain_utils.rs @@ -0,0 +1,225 @@ +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use pageserver_api::controller_api::NodeSchedulingPolicy; +use utils::{id::NodeId, shard::TenantShardId}; + +use crate::{ + 
background_node_operations::OperationError, node::Node, scheduler::Scheduler, + tenant_shard::TenantShard, +}; + +pub(crate) struct TenantShardIterator { + tenants_accessor: F, + inspected_all_shards: bool, + last_inspected_shard: Option, +} + +/// A simple iterator which can be used in tandem with [`crate::service::Service`] +/// to iterate over all known tenant shard ids without holding the lock on the +/// service state at all times. +impl TenantShardIterator +where + F: Fn(Option) -> Option, +{ + pub(crate) fn new(tenants_accessor: F) -> Self { + Self { + tenants_accessor, + inspected_all_shards: false, + last_inspected_shard: None, + } + } + + /// Returns the next tenant shard id if one exists + pub(crate) fn next(&mut self) -> Option { + if self.inspected_all_shards { + return None; + } + + match (self.tenants_accessor)(self.last_inspected_shard) { + Some(tid) => { + self.last_inspected_shard = Some(tid); + Some(tid) + } + None => { + self.inspected_all_shards = true; + None + } + } + } + + /// Returns true when the end of the iterator is reached and false otherwise + pub(crate) fn finished(&self) -> bool { + self.inspected_all_shards + } +} + +/// Check that the state of the node being drained is as expected: +/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`] +pub(crate) fn validate_node_state( + node_id: &NodeId, + nodes: Arc>, +) -> Result<(), OperationError> { + let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( + format!("node {} was removed", node_id).into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {} changed state to {:?}", node_id, current_policy).into(), + )); + } + + Ok(()) +} + +/// Struct that houses a few utility methods for draining pageserver nodes +pub(crate) struct TenantShardDrain { + pub(crate) drained_node: NodeId, + pub(crate) tenant_shard_id: TenantShardId, +} + +impl TenantShardDrain { + /// Check if the tenant shard under question is eligible for drainining: + /// it's primary attachment is on the node being drained + pub(crate) fn tenant_shard_eligible_for_drain( + &self, + tenants: &BTreeMap, + scheduler: &Scheduler, + ) -> Option { + let tenant_shard = tenants.get(&self.tenant_shard_id)?; + + if *tenant_shard.intent.get_attached() != Some(self.drained_node) { + return None; + } + + match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + Some(node) => Some(node), + None => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "No eligible secondary while draining {}", self.drained_node + ); + + None + } + } + } + + /// Attempt to reschedule the tenant shard under question to one of its secondary locations + /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard + /// should be skipped. + pub(crate) fn reschedule_to_secondary<'a>( + &self, + destination: NodeId, + tenants: &'a mut BTreeMap, + scheduler: &mut Scheduler, + nodes: &Arc>, + ) -> Result, OperationError> { + let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) { + Some(some) => some, + None => { + // Tenant shard was removed in the meantime. 
+ // Skip to the next one, but don't fail the overall operation + return Ok(None); + } + }; + + if !nodes.contains_key(&destination) { + return Err(OperationError::NodeStateChanged( + format!("node {} was removed", destination).into(), + )); + } + + if !tenant_shard.intent.get_secondary().contains(&destination) { + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Secondary moved away from {destination} during drain" + ); + + return Ok(None); + } + + match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) { + Err(e) => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling error when draining pageserver {} : {}", self.drained_node, e + ); + + Ok(None) + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + self.drained_node, + self.drained_node, + scheduled_to + ); + + Ok(Some(tenant_shard)) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; + + use super::TenantShardIterator; + + #[test] + fn test_tenant_shard_iterator() { + let tenant_id = TenantId::generate(); + let shard_count = ShardCount(8); + + let mut tenant_shards = Vec::default(); + for i in 0..shard_count.0 { + tenant_shards.push(( + TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count, + }, + (), + )) + } + + let tenant_shards = Arc::new(tenant_shards); + + let mut tid_iter = TenantShardIterator::new({ + let tenants = tenant_shards.clone(); + move |last_inspected_shard: Option| { + let entry = match last_inspected_shard { + Some(skip_past) => { + let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); + cursor.nth(1) + } + None => tenants.first(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + let mut iterated_over = Vec::default(); + while let Some(tid) = tid_iter.next() { + iterated_over.push((tid, ())); + } + + assert_eq!(iterated_over, *tenant_shards); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 8caf638904..26c258c466 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter; mod auth; mod background_node_operations; mod compute_hook; +mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2799f21fdc..a66e9128bc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -92,6 +92,11 @@ struct Cli { /// Chaos testing #[arg(long)] chaos_interval: Option, + + // Maximum acceptable lag for the secondary location while draining + // a pageserver + #[arg(long)] + max_secondary_lag_bytes: Option, } enum StrictMode { @@ -279,6 +284,7 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, + max_secondary_lag_bytes: args.max_secondary_lag_bytes, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 254fdb364e..94db879ade 100644 --- 
a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -39,6 +39,9 @@ pub(super) struct Reconciler { /// to detach this tenant shard. pub(crate) detach: Vec, + /// Configuration specific to this reconciler + pub(crate) reconciler_config: ReconcilerConfig, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -73,6 +76,65 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +pub(crate) struct ReconcilerConfigBuilder { + config: ReconcilerConfig, +} + +impl ReconcilerConfigBuilder { + pub(crate) fn new() -> Self { + Self { + config: ReconcilerConfig::default(), + } + } + + pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_warmup_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_download_request_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn build(self) -> ReconcilerConfig { + self.config + } +} + +#[derive(Default, Debug, Copy, Clone)] +pub(crate) struct ReconcilerConfig { + // During live migration give up on warming-up the secondary + // after this timeout. + secondary_warmup_timeout: Option, + + // During live migrations this is the amount of time that + // the pagserver will hold our poll. + secondary_download_request_timeout: Option, +} + +impl ReconcilerConfig { + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { + const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); + self.secondary_warmup_timeout + .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT) + } + + pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration { + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20); + self.secondary_download_request_timeout + .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, @@ -300,11 +362,13 @@ impl Reconciler { ) -> Result<(), ReconcileError> { // This is not the timeout for a request, but the total amount of time we're willing to wait // for a secondary location to get up to date before - const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout(); // This the long-polling interval for the secondary download requests we send to destination pageserver // during a migration. - const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + let request_download_timeout = self + .reconciler_config + .get_secondary_download_request_timeout(); let started_at = Instant::now(); @@ -315,14 +379,14 @@ impl Reconciler { client .tenant_secondary_download( tenant_shard_id, - Some(REQUEST_DOWNLOAD_TIMEOUT), + Some(request_download_timeout), ) .await }, &self.service_config.jwt_token, 1, 3, - REQUEST_DOWNLOAD_TIMEOUT * 2, + request_download_timeout * 2, &self.cancel, ) .await @@ -350,7 +414,7 @@ impl Reconciler { return Ok(()); } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); - if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + if total_runtime > total_download_timeout { tracing::warn!("Timed out after {}ms downloading layers to {node}. 
Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e391ce65e6..31b2d0c3f5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -14,10 +14,11 @@ use crate::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, compute_hook::NotifyError, + drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, metrics::LeadershipStatusGroup, persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, - reconciler::{ReconcileError, ReconcileUnits}, + reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, @@ -325,6 +326,12 @@ pub struct Config { // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, + + // Maximum acceptable download lag for the secondary location + // while draining a node. If the secondary location is lagging + // by more than the configured amount, then the secondary is not + // upgraded to primary. + pub max_secondary_lag_bytes: Option, } impl From for ApiError { @@ -5187,11 +5194,22 @@ impl Service { Ok(()) } - /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler + /// configuration fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, + ) -> Option { + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + } + + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + fn maybe_configured_reconcile_shard( + &self, + shard: &mut TenantShard, + nodes: &Arc>, + reconciler_config: ReconcilerConfig, ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); @@ -5241,6 +5259,7 @@ impl Service { &self.result_tx, nodes, &self.compute_hook, + reconciler_config, &self.config, &self.persistence, units, @@ -5715,18 +5734,92 @@ impl Service { self.gate.close().await; } + /// Spot check the download lag for a secondary location of a shard. + /// Should be used as a heuristic, since it's not always precise: the + /// secondary might have not downloaded the new heat map yet and, hence, + /// is not aware of the lag. + /// + /// Returns: + /// * Ok(None) if the lag could not be determined from the status, + /// * Ok(Some(_)) if the lag could be determind + /// * Err on failures to query the pageserver. 
+ async fn secondary_lag( + &self, + secondary: &NodeId, + tenant_shard_id: TenantShardId, + ) -> Result, mgmt_api::Error> { + let nodes = self.inner.read().unwrap().nodes.clone(); + let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( + StatusCode::NOT_FOUND, + format!("Node with id {} not found", secondary), + ))?; + + match node + .with_client_retries( + |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, + &self.config.jwt_token, + 1, + 3, + Duration::from_millis(250), + &self.cancel, + ) + .await + { + Some(Ok(status)) => match status.heatmap_mtime { + Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)), + None => Ok(None), + }, + Some(Err(e)) => Err(e), + None => Err(mgmt_api::Error::Cancelled), + } + } + /// Drain a node by moving the shards attached to it as primaries. /// This is a long running operation and it should run as a separate Tokio task. pub(crate) async fn drain_node( - &self, + self: &Arc, node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - let mut last_inspected_shard: Option = None; - let mut inspected_all_shards = false; + const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024; + let max_secondary_lag_bytes = self + .config + .max_secondary_lag_bytes + .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT); + + // By default, live migrations are generous about the wait time for getting + // the secondary location up to speed. When draining, give up earlier in order + // to not stall the operation when a cold secondary is encountered. + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut waiters = Vec::new(); - while !inspected_all_shards { + let mut tid_iter = TenantShardIterator::new({ + let service = self.clone(); + move |last_inspected_shard: Option| { + let locked = &service.inner.read().unwrap(); + let tenants = &locked.tenants; + let entry = match last_inspected_shard { + Some(skip_past) => { + // Skip to the last seen tenant shard id + let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); + + // Skip past the last seen + cursor.nth(1) + } + None => tenants.first_key_value(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + while !tid_iter.finished() { if cancel.is_cancelled() { match self .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) @@ -5745,71 +5838,82 @@ impl Service { } } - { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?; - let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( - format!("node {node_id} was removed").into(), - ))?; - - let current_policy = node.get_scheduling(); - if !matches!(current_policy, NodeSchedulingPolicy::Draining) { - // TODO(vlad): maybe cancel pending reconciles before erroring out. 
need to think - // about it - return Err(OperationError::NodeStateChanged( - format!("node {node_id} changed state to {current_policy:?}").into(), - )); - } - - let mut cursor = tenants.iter_mut().skip_while({ - let skip_past = last_inspected_shard; - move |(tid, _)| match skip_past { - Some(last) => **tid != last, - None => false, + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let tid = match tid_iter.next() { + Some(tid) => tid, + None => { + break; } - }); + }; - while waiters.len() < MAX_RECONCILES_PER_OPERATION { - let (tid, tenant_shard) = match cursor.next() { - Some(some) => some, + let tid_drain = TenantShardDrain { + drained_node: node_id, + tenant_shard_id: tid, + }; + + let dest_node_id = { + let locked = self.inner.read().unwrap(); + + match tid_drain + .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) + { + Some(node_id) => node_id, None => { - inspected_all_shards = true; - break; + continue; } - }; + } + }; - // If the shard is not attached to the node being drained, skip it. - if *tenant_shard.intent.get_attached() != Some(node_id) { - last_inspected_shard = Some(*tid); + match self.secondary_lag(&dest_node_id, tid).await { + Ok(Some(lag)) if lag <= max_secondary_lag_bytes => { + // The secondary is reasonably up to date. + // Migrate to it + } + Ok(Some(lag)) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile." + ); continue; } + Ok(None) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile." + ); + continue; + } + Err(err) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Failed to get secondary lag from node {dest_node_id}. 
Skipping reconcile: {err}" + ); + continue; + } + } - match tenant_shard.reschedule_to_secondary(None, scheduler) { - Err(e) => { - tracing::warn!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Scheduling error when draining pageserver {} : {e}", node_id - ); - } - Ok(()) => { - let scheduled_to = tenant_shard.intent.get_attached(); - tracing::info!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Rescheduled shard while draining node {}: {} -> {:?}", - node_id, - node_id, - scheduled_to - ); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let rescheduled = tid_drain.reschedule_to_secondary( + dest_node_id, + tenants, + scheduler, + nodes, + )?; - let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); - if let Some(some) = waiter { - waiters.push(some); - } + if let Some(tenant_shard) = rescheduled { + let waiter = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ); + if let Some(some) = waiter { + waiters.push(some); } } - - last_inspected_shard = Some(*tid); } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e250f29f98..1fcc3c8547 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,7 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, - reconciler::ReconcileUnits, + reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, service::ReconcileResultRequest, }; @@ -1063,6 +1063,7 @@ impl TenantShard { result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, + reconciler_config: ReconcilerConfig, service_config: &service::Config, persistence: &Arc, units: ReconcileUnits, @@ -1101,6 +1102,7 @@ impl TenantShard { generation: self.generation, intent: reconciler_intent, detach, + reconciler_config, config: self.config.clone(), observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c6f4404784..844a23d327 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import textwrap import threading import time import uuid +from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -2667,6 +2668,69 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]: + """ + Get the intent and observed placements of all tenants known to the storage controller. 
+ """ + tenants = self.tenant_list() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append( + int(node_id) + ) + + if "attached" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][ + "attached" + ] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + return tenant_placement + + def warm_up_all_secondaries(self): + log.info("Warming up all secondary locations") + + tenant_placement = self.get_tenants_placement() + for tid, placement in tenant_placement.items(): + assert placement["observed"]["attached"] is not None + primary_id = placement["observed"]["attached"] + + assert len(placement["observed"]["secondary"]) == 1 + secondary_id = placement["observed"]["secondary"][0] + + parsed_tid = TenantShardId.parse(tid) + self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid) + self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download( + parsed_tid, wait_ms=250 + ) + @property def workdir(self) -> Path: return self.env.repo_dir diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 65d6ff5d62..cd4261f1b8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -361,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) + def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" + res = self.get(url) + self.verbose_error(res) + return res.json() + def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 04785f7184..297aedfbed 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -2,7 +2,6 @@ import concurrent.futures import random import time from collections import defaultdict -from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -24,51 +23,14 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ This function takes into account the intersection of the intent and the observed state. If they do not match, it asserts out. 
""" - tenants = env.storage_controller.tenant_list() - - intent = dict() - observed = dict() - - tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( - lambda: { - "observed": {"attached": None, "secondary": []}, - "intent": {"attached": None, "secondary": []}, - } - ) - - for t in tenants: - for node_id, loc_state in t["observed"]["locations"].items(): - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] - in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) - ): - observed[t["tenant_shard_id"]] = int(node_id) - tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) - - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] == "Secondary" - ): - tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) - - if "attached" in t["intent"]: - intent[t["tenant_shard_id"]] = t["intent"]["attached"] - tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] - - if "secondary" in t["intent"]: - tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ - "secondary" - ] - + tenant_placement = env.storage_controller.get_tenants_placement() log.info(f"{tenant_placement=}") matching = { - tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + tid: tenant_placement[tid]["intent"]["attached"] + for tid in tenant_placement + if tenant_placement[tid]["intent"]["attached"] + == tenant_placement[tid]["observed"]["attached"] } assert len(matching) == total_shards diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index eb2cdccdb9..9b2557a165 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -17,6 +17,7 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, TokenScope, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -1597,6 +1598,8 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -1645,6 +1648,115 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert_shard_counts_balanced(env, shard_counts, total_shards) +def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Artificially make a tenant shard's secondary location lag behind the primary + and check that storage controller driven node drains skip the lagging tenant shard. + Finally, validate that the tenant shard is migrated when a new drain request comes + in and it's no longer lagging. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.storage_controller_config = { + "max_secondary_lag_bytes": 1 * 1024 * 1024, + } + + env = neon_env_builder.init_configs() + env.start() + + tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + + # Give things a chance to settle. 
+ env.storage_controller.reconcile_until_idle(timeout_secs=30) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + primary: int = locations[0]["node_id"] + not_primary = [ps.id for ps in env.pageservers if ps.id != primary] + assert len(not_primary) == 1 + secondary = not_primary[0] + + log.info(f"Paused secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "pause") + ) + + log.info(f"Ingesting some data for {tid}") + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + last_flush_lsn_upload(env, endpoint, tid, timeline_id) + + log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}") + + env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + def secondary_is_lagging(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag <= 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + log.info(f"Looking for lag to develop on the secondary {secondary}") + wait_until(10, 1, secondary_is_lagging) + + log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == primary + + log.info(f"Unpausing secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "off") + ) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + log.info(f"Waiting for lag to reduce on {secondary}") + + def lag_is_acceptable(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag > 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + wait_until(10, 1, lag_is_acceptable) + + env.storage_controller.node_configure(primary, {"scheduling": "Active"}) + + log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}") + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == secondary + + def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() @@ -1671,6 +1783,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id + env.storage_controller.warm_up_all_secondaries() 
env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, From 642aa1e160bd6c7c99ff8707a8b91b7309650eb0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 15:48:16 +0100 Subject: [PATCH 374/412] Dockerfiles: remove cachepot (#8666) ## Problem We install and try to use `cachepot`. But it is not configured correctly and doesn't work (after https://github.com/neondatabase/neon/pull/2290) ## Summary of changes - Remove `cachepot` --- .github/workflows/neon_extra_builds.yml | 2 -- Dockerfile | 15 +-------------- Dockerfile.build-tools | 2 -- libs/utils/src/lib.rs | 2 +- 4 files changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index d4870e16ad..2ee66cfdc1 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -149,8 +149,6 @@ jobs: env: BUILD_TYPE: release - # remove the cachepot wrapper and build without crate caches - RUSTC_WRAPPER: "" # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental compilation CARGO_INCREMENTAL: 0 diff --git a/Dockerfile b/Dockerfile index 6ed57a84a3..ceb1c7cb55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,24 +29,12 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY - COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . -# Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ @@ -58,8 +46,7 @@ RUN set -e \ --bin proxy \ --bin neon_local \ --bin storage_scrubber \ - --locked --release \ - && cachepot -s + --locked --release # Build final image # diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d39d36e1b6..d6beb61369 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -208,7 +208,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . 
"$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ cargo install rustfilt --version ${RUSTFILT_VERSION} && \ cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ @@ -216,7 +215,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot # Show versions RUN whoami \ diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index a46d68ef33..f4fc0ba57b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -128,7 +128,7 @@ pub mod circuit_breaker; /// /// ############################################################################################# /// TODO this macro is not the way the library is intended to be used, see for details. -/// We use `cachepot` to reduce our current CI build times: +/// We used `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. From 66f86f184bff8e280bc7ae53fd8b94b843f2766e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Aug 2024 18:30:15 +0100 Subject: [PATCH 375/412] Update docs/SUMMARY.md (#8665) ## Problem This page had many dead links, and was confusing for folks looking for documentation about our product. Closes: https://github.com/neondatabase/neon/issues/8535 ## Summary of changes - Add a link to the product docs up top - Remove dead/placeholder links --- docs/SUMMARY.md | 59 ++++++++++--------------------------------------- 1 file changed, 12 insertions(+), 47 deletions(-) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b275349168..5fd4080c28 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,18 @@ # Summary +# Looking for `neon.tech` docs? + +This page linkes to a selection of technical content about the open source code in this repository. + +Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code +in this repository. + +# Architecture + [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) -# Architecture - - [Compute]() - - [WAL proposer]() - - [WAL Backpressure]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) @@ -16,33 +21,15 @@ - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - - [Datadir mapping]() - - [Layer files]() - - [Branching]() - - [Garbage collection]() - - [Cloud Storage]() - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - - [Management API]() - - [Tenant Rebalancing]() - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - - [Management API]() - - [Rebalancing]() - -- [Control Plane]() - -- [Proxy]() - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline. 
- [Error handling and logging](./error-handling.md) - - [Testing]() - - [Unit testing]() - - [Integration testing]() - - [Benchmarks]() - - [Glossary](./glossary.md) @@ -58,28 +45,6 @@ # RFCs -- [RFCs](./rfcs/README.md) - -- [002-storage](rfcs/002-storage.md) -- [003-laptop-cli](rfcs/003-laptop-cli.md) -- [004-durability](rfcs/004-durability.md) -- [005-zenith_local](rfcs/005-zenith_local.md) -- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) -- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) -- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) -- [008-push-pull](rfcs/008-push-pull.md) -- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) -- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) -- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) -- [010-storage_details](rfcs/010-storage_details.md) -- [011-retention-policy](rfcs/011-retention-policy.md) -- [012-background-tasks](rfcs/012-background-tasks.md) -- [013-term-history](rfcs/013-term-history.md) -- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) -- [014-storage-lsm](rfcs/014-storage-lsm.md) -- [015-storage-messaging](rfcs/015-storage-messaging.md) -- [016-connection-routing](rfcs/016-connection-routing.md) -- [017-timeline-data-management](rfcs/017-timeline-data-management.md) -- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) -- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) -- [cluster-size-limits](rfcs/cluster-size-limits.md) +Major changes are documented in RFCS: +- See [RFCs](./rfcs/README.md) for more information +- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs From c7c58eeab821c762dd09325a8556985e585359bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 10 Aug 2024 14:04:47 +0200 Subject: [PATCH 376/412] Also pass HOME env var in access_env_vars (#8685) Noticed this while debugging a test failure in #8673 which only occurs with real S3 instead of mock S3: if you authenticate to S3 via `AWS_PROFILE`, then it requires the `HOME` env var to be set so that it can read inside the `~/.aws` directory. The scrubber abstraction `StorageScrubber::scrubber_cli` in `neon_fixtures.py` would otherwise not work. My earlier PR #6556 has done similar things for the `neon_local` wrapper. You can try: ``` aws sso login --profile dev export ENABLE_REAL_S3_REMOTE_STORAGE=y REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests REMOTE_STORAGE_S3_REGION=eu-central-1 AWS_PROFILE=dev RUST_BACKTRACE=1 BUILD_TYPE=debug DEFAULT_PG_VERSION=16 ./scripts/pytest -vv --tb=short -k test_scrubber_tenant_snapshot ``` before and after this patch: this patch fixes it. 
--- test_runner/fixtures/remote_storage.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 0f2a997b1e..1b6c3c23ba 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -177,9 +177,14 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: - return { + env = { "AWS_PROFILE": self.aws_profile, } + # Pass through HOME env var because AWS_PROFILE needs it in order to work + home = os.getenv("HOME") + if home is not None: + env["HOME"] = home + return env if self.access_key is not None and self.secret_key is not None: return { "AWS_ACCESS_KEY_ID": self.access_key, From d69f79c7eb930b8ffd368d7ada479e696f204871 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 11 Aug 2024 12:21:32 +0100 Subject: [PATCH 377/412] chore(deps): bump aiohttp from 3.9.4 to 3.10.2 (#8684) --- poetry.lock | 170 ++++++++++++++++++++++++++----------------------- pyproject.toml | 2 +- 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9026824558..7db91e51f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,91 +1,103 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "aiohappyeyeballs" +version = "2.3.5" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, + {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, +] + [[package]] name = "aiohttp" -version = "3.9.4" +version = "3.10.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, - {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, - {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, - {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, - {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, - {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, - {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, - {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, - {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, - {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, - {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, - {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, - {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, + {file = 
"aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, + {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, + {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, + {file = 
"aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, + {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, + {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, + {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, + {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, + {file = 
"aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, + {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, + {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = 
"sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, + {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, + {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, + {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" @@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiopg" @@ -3371,4 +3383,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72" +content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" diff --git a/pyproject.toml b/pyproject.toml index cfb569b2ba..ad3961ef55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.4" +aiohttp = "3.10.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" From 7474790c808a642ee86e7b99b37df5e9e7317ddb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 24 Sep 2024 17:34:56 +0100 Subject: [PATCH 378/412] CI(promote-images): fix prod ECR auth (#9131) ## Problem Login to prod ECR doesn't work anymore: ``` Retrieving registries data through *** SDK... *** ECR detected with eu-central-1 region Error: The security token included in the request is invalid. 
``` Ref https://github.com/neondatabase/neon/actions/runs/11015238522/job/30592994281 Tested https://github.com/neondatabase/neon/actions/runs/11017690614/job/30596213259#step:5:18 (on https://github.com/neondatabase/neon/commit/aae6182ff) ## Summary of changes - Fix login to prod ECR by using `aws-actions/configure-aws-credentials` --- .github/workflows/build_and_test.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a634edb96b..bfdf3be2b9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -843,6 +843,9 @@ jobs: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 + permissions: + id-token: write # for `aws-actions/configure-aws-credentials` + env: VERSIONS: v14 v15 v16 v17 @@ -887,13 +890,19 @@ jobs: docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + - name: Configure AWS-prod credentials + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + mask-aws-account-id: true + role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }} + - name: Login to prod ECR uses: docker/login-action@v3 if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' with: registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }} - password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }} - name: Copy all images to prod ECR if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' From 2b3cc87a2aef7c133558d4e26f7e60b88ad1ac68 Mon Sep 17 00:00:00 2001 From: David Gomes Date: Mon, 21 Oct 2024 18:39:30 -0500 Subject: [PATCH 379/412] chore(compute): bumps pg_session_jwt to latest version (#9474) --- compute/Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 74970696b5..6451e309f0 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -975,8 +975,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \ - echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \ + echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release From 237d6ffc0204c23a4d672ec541df1426ab8ce79b Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 29 Oct 2024 22:02:03 +0100 Subject: [PATCH 380/412] chore(compute): Bump pg_mooncake to the latest version The topmost commit in the `neon` branch at the time of writing this https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d --- compute/compute-node.Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 85fb9c441d..d62c3c1668 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1131,14 +1131,17 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e +# The topmost commit in the `neon` branch at the time of writing this +# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ +# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d +ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ 'v14') \ echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ esac && \ - git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \ + git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ cd pg_mooncake-src && \ git checkout "${PG_MOONCAKE_VERSION}" && \ git submodule update --init --depth 1 --recursive && \ From 04f91eea4567982a6f1831287ff7c8d39280cda4 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:13:46 -0500 Subject: [PATCH 381/412] fix(pageserver): increase frozen layer warning threshold; ignore in tests (#9705) Perf benchmarks produce a lot of layers. ## Summary of changes Bumping the threshold and ignore the warning. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 2 +- test_runner/fixtures/pageserver/allowed_errors.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 56faacbaee..09ddb19765 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3531,7 +3531,7 @@ impl Timeline { self.get_compaction_threshold(), DEFAULT_COMPACTION_THRESHOLD, ) - && frozen_layer_total_size >= /* 64 MB */ 64000000 + && frozen_layer_total_size >= /* 128 MB */ 128000000 { tracing::warn!( "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index fa85563e35..d05704c8e0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -93,6 +93,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", ) From 96d66a201d5eec294c9bf62719d31972fa7eb024 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 09:33:14 -0600 Subject: [PATCH 382/412] Clean up C.UTF-8 locale changes Removes some unnecessary initdb arguments, and fixes Neon for MacOS since it doesn't seem to ship a C.UTF-8 locale. Signed-off-by: Tristan Partin --- compute_tools/src/config.rs | 15 +++++++++++---- libs/pageserver_api/src/config.rs | 6 +++++- pageserver/src/tenant.rs | 6 ------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 50e2a95e9d..d4e413034e 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -74,10 +74,17 @@ pub fn write_postgres_conf( } // Locales - writeln!(file, "lc_messages='C.UTF-8'")?; - writeln!(file, "lc_monetary='C.UTF-8'")?; - writeln!(file, "lc_time='C.UTF-8'")?; - writeln!(file, "lc_numeric='C.UTF-8'")?; + if cfg!(target_os = "macos") { + writeln!(file, "lc_messages='C'")?; + writeln!(file, "lc_monetary='C'")?; + writeln!(file, "lc_time='C'")?; + writeln!(file, "lc_numeric='C'")?; + } else { + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + } match spec.mode { ComputeMode::Primary => {} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 4272181954..f48c1febb5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -277,7 +277,11 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - pub const DEFAULT_LOCALE: &str = "C.UTF-8"; + pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" + } else { + "C.UTF-8" + }; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 903174680e..774672aed6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4786,12 +4786,6 @@ async fn run_initdb( .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) .args(["--locale", 
&conf.locale]) - .args(["--lc-collate", &conf.locale]) - .args(["--lc-ctype", &conf.locale]) - .args(["--lc-messages", &conf.locale]) - .args(["--lc-monetary", &conf.locale]) - .args(["--lc-numeric", &conf.locale]) - .args(["--lc-time", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() From 0fc6f6af8e293f407fd1b84ebb7bc747f8fbbfd1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 06:05:23 +0000 Subject: [PATCH 383/412] Storage & Compute release 2024-11-25 From 166f33f96b93a93031b2bb5fb2b03415c18ced7f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 25 Nov 2024 16:19:36 +0100 Subject: [PATCH 384/412] Fixup Storage & Compute Release 2024-11-25 From 304af5c9e3e490ca8b96bdbff26e63c3266b3a42 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 06:05:37 +0000 Subject: [PATCH 385/412] Storage & Compute release 2024-12-02 From 6c349e76d9aa223eb31bf925609f1a1adaabe6dc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 06:05:40 +0000 Subject: [PATCH 386/412] Storage release 2024-12-09 From bc6354921f3f5906c1004319b907b582d1cce660 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Dec 2024 02:51:44 +0000 Subject: [PATCH 387/412] pageserver: fix CLog truncate walingest --- pageserver/src/walingest.rs | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 30c8965d51..b7712cfac7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -877,22 +877,24 @@ impl WalIngest { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - for segno in modification - .tline - .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) - .await? - { - let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; + if modification.tline.get_shard_identity().is_shard_zero() { + for segno in modification + .tline + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) + .await? + { + let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - let may_delete = dispatch_pgversion!(modification.tline.pg_version, { - pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) - }); + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) + }); - if may_delete { - modification - .drop_slru_segment(SlruKind::Clog, segno, ctx) - .await?; - trace!("Drop CLOG segment {:>04X}", segno); + if may_delete { + modification + .drop_slru_segment(SlruKind::Clog, segno, ctx) + .await?; + trace!("Drop CLOG segment {:>04X}", segno); + } } } From fcfd1c7d0a47e08ad874b14bbfc7ec32a1efb24f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 13:23:19 +0100 Subject: [PATCH 388/412] pageserver: don't drop multixact slrus on non zero shards --- pageserver/src/walingest.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index b7712cfac7..e5b23fed51 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1049,16 +1049,18 @@ impl WalIngest { // Delete all the segments except the last one. 
The last segment can still // contain, possibly partially, valid data. - while segment != endsegment { - modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) - .await?; + if modification.tline.get_shard_identity().is_shard_zero() { + while segment != endsegment { + modification + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) + .await?; - /* move to next segment, handling wraparound correctly */ - if segment == maxsegment { - segment = 0; - } else { - segment += 1; + /* move to next segment, handling wraparound correctly */ + if segment == maxsegment { + segment = 0; + } else { + segment += 1; + } } } From fde1046278bbf432666e93c5d88d4366fb86ce9b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 12:35:02 +0000 Subject: [PATCH 389/412] wal_decoder: fix compact key protobuf encoding (#10074) ## Problem Protobuf doesn't support 128 bit integers, so we encode the keys as two 64 bit integers. Issue is that when we split the 128 bit compact key we use signed 64 bit integers to represent the two halves. This may result in a negative lower half when relnode is larger than `0x00800000`. When we convert the lower half to an i128 we get a negative `CompactKey`. ## Summary of Changes Use unsigned integers when encoding into Protobuf. ## Deployment * Prod: We disabled the interpreted proto, so no compat concerns. * Staging: Disable the interpreted proto, do one release, and then release the fixed version. We do this because a negative int32 will convert to a large uint32 value and could give a key in the actual pageserver space. In production we would around this by adding new fields to the proto and deprecating the old ones, but we can make our lives easy here. * Pre-prod: Same as staging --- libs/pageserver_api/src/key.rs | 2 +- libs/wal_decoder/proto/interpreted_wal.proto | 4 +- libs/wal_decoder/src/wire_format.rs | 65 +++++++++++++++++++- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 37dff6fe46..373329c9b4 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -24,7 +24,7 @@ pub struct Key { /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as /// a struct of fields. -#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)] pub struct CompactKey(i128); /// The storage key size. 
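To make the failure mode described in the commit message concrete, here is a minimal standalone sketch (independent of the repository types; the constant below is an arbitrary example value, not a real key) showing how a signed 64-bit split sign-extends a low half whose top bit is set, while an unsigned split round-trips cleanly:

```rust
// Standalone illustration of the encoding bug described above: when the low
// 64 bits of the 128-bit compact key have their top bit set, a signed split
// sign-extends that half on reassembly and corrupts the value; an unsigned
// split round-trips correctly. The constant is an arbitrary example, not a
// real key.
fn main() {
    let raw: i128 = 0x0000_0100_0000_0200_8000_0000_1000_0005_u128 as i128;

    // Old mapping: both halves encoded as signed 64-bit integers.
    let high_i = (raw >> 64) as i64;
    let low_i = raw as i64; // negative, because bit 63 of the low half is set
    let back_signed = ((high_i as i128) << 64) | (low_i as i128); // sign extension floods the high bits with 1s

    // Fixed mapping: both halves encoded as unsigned 64-bit integers.
    let high_u = (raw >> 64) as u64;
    let low_u = raw as u64;
    let back_unsigned = ((high_u as i128) << 64) | (low_u as i128);

    assert_ne!(back_signed, raw); // corrupted round trip
    assert_eq!(back_unsigned, raw); // faithful round trip
    println!("signed: {back_signed:#x}, unsigned: {back_unsigned:#x}");
}
```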
diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index 0393392c1a..d68484d30f 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -37,7 +37,7 @@ message ValueMeta { } message CompactKey { - int64 high = 1; - int64 low = 2; + uint64 high = 1; + uint64 low = 2; } diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 5a343054c3..944ee5c919 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -236,8 +236,8 @@ impl From for proto::ValueMeta { impl From for proto::CompactKey { fn from(value: CompactKey) -> Self { proto::CompactKey { - high: (value.raw() >> 64) as i64, - low: value.raw() as i64, + high: (value.raw() >> 64) as u64, + low: value.raw() as u64, } } } @@ -354,3 +354,64 @@ impl From for CompactKey { (((value.high as i128) << 64) | (value.low as i128)).into() } } + +#[test] +fn test_compact_key_with_large_relnode() { + use pageserver_api::key::Key; + + let inputs = vec![ + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x007FFFFF, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x00800000, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x00800001, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0xFFFFFFFF, + field3: 0xFFFFFFFF, + field4: 0xFFFFFFFF, + field5: 0x0, + field6: 0x0, + }, + ]; + + for input in inputs { + assert!(input.is_valid_key_on_write_path()); + let compact = input.to_compact(); + let proto: proto::CompactKey = compact.into(); + let from_proto: CompactKey = proto.into(); + + assert_eq!( + compact, from_proto, + "Round trip failed for key with relnode={:#x}", + input.field4 + ); + } +} From c4ce4ac25ad95c28e7de8b4d2d0bc244caeb52d6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 11 Dec 2024 14:37:08 +0100 Subject: [PATCH 390/412] page_service: don't count time spent in Batcher towards smgr latency metrics (#10075) ## Problem With pipelining enabled, the time a request spends in the batcher stage counts towards the smgr op latency. If pipelining is disabled, that time is not accounted for. In practice, this results in a jump in smgr getpage latencies in various dashboards and degrades the internal SLO. ## Solution In a similar vein to #10042 and with a similar rationale, this PR stops counting the time spent in batcher stage towards smgr op latency. The smgr op latency metric is reduced to the actual execution time. Time spent in batcher stage is tracked in a separate histogram. I expect to remove that histogram after batching rollout is complete, but it will be helpful in the meantime to reason about the rollout. 
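The timing split described above can be summarized with a small standalone sketch (hypothetical names, not the pageserver's metrics code): the three recorded instants divide a request's wall time into batch wait, throttle wait, and execution, and only the execution slice feeds the smgr latency histogram, with batch wait tracked separately.

```rust
// Illustrative sketch only: given the instants described above -- when the
// request was received, when it left the batcher and entered the throttle,
// and when execution started -- the wall time splits into three parts.
use std::time::{Duration, Instant};

// Hypothetical container for the split durations.
struct SmgrTimingBreakdown {
    batch_wait: Duration, // received -> entered throttle (time spent in the batcher)
    throttle: Duration,   // entered throttle -> execution started
    execution: Duration,  // execution started -> completion
}

fn split_smgr_timings(
    received_at: Instant,
    throttle_started_at: Instant,
    started_execution_at: Instant,
    completed_at: Instant,
) -> SmgrTimingBreakdown {
    SmgrTimingBreakdown {
        batch_wait: throttle_started_at - received_at,
        throttle: started_execution_at - throttle_started_at,
        execution: completed_at - started_execution_at,
    }
}

fn main() {
    let received_at = Instant::now();
    // Pretend the request sat in the batcher, was throttled, then executed.
    let throttle_started_at = received_at + Duration::from_millis(3);
    let started_execution_at = throttle_started_at + Duration::from_millis(1);
    let completed_at = started_execution_at + Duration::from_millis(2);

    let t = split_smgr_timings(received_at, throttle_started_at, started_execution_at, completed_at);
    // Only `t.execution` would be observed into the smgr op latency histogram;
    // `t.batch_wait` would go to the separate batch-wait histogram.
    println!("batch={:?} throttle={:?} execution={:?}", t.batch_wait, t.throttle, t.execution);
}
```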
--- pageserver/src/metrics.rs | 168 +++++++++++++++++++++--------- pageserver/src/page_service.rs | 13 ++- pageserver/src/tenant/throttle.rs | 19 ++-- test_runner/fixtures/metrics.py | 1 + 4 files changed, 143 insertions(+), 58 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 96ee157856..b4e20cb8b9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; use strum::{EnumCount, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; -use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -1225,32 +1224,58 @@ pub(crate) mod virtual_file_io_engine { pub(crate) struct SmgrOpTimer(Option); pub(crate) struct SmgrOpTimerInner { - global_latency_histo: Histogram, + global_execution_latency_histo: Histogram, + per_timeline_execution_latency_histo: Option, - // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option, + global_batch_wait_time: Histogram, + per_timeline_batch_wait_time: Histogram, global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, - start: Instant, - throttled: Duration, - op: SmgrQueryType, + timings: SmgrOpTimerState, +} + +#[derive(Debug)] +enum SmgrOpTimerState { + Received { + received_at: Instant, + }, + ThrottleDoneExecutionStarting { + received_at: Instant, + throttle_started_at: Instant, + started_execution_at: Instant, + }, } pub(crate) struct SmgrOpFlushInProgress { - base: Instant, + flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } impl SmgrOpTimer { - pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { - let Some(throttle) = throttle else { - return; - }; + pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) { let inner = self.0.as_mut().expect("other public methods consume self"); - inner.throttled += *throttle; + match (&mut inner.timings, throttle) { + (SmgrOpTimerState::Received { received_at }, throttle) => match throttle { + ThrottleResult::NotThrottled { start } => { + inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { + received_at: *received_at, + throttle_started_at: *start, + started_execution_at: *start, + }; + } + ThrottleResult::Throttled { start, end } => { + inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { + received_at: *start, + throttle_started_at: *start, + started_execution_at: *end, + }; + } + }, + (x, _) => panic!("called in unexpected state: {x:?}"), + } } pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { @@ -1263,7 +1288,7 @@ impl SmgrOpTimer { .. 
} = inner; SmgrOpFlushInProgress { - base: flush_start, + flush_started_at: flush_start, global_micros: global_flush_in_progress_micros, per_timeline_micros: per_timeline_flush_in_progress_micros, } @@ -1274,32 +1299,42 @@ impl SmgrOpTimer { let inner = self.0.take()?; let now = Instant::now(); - let elapsed = now - inner.start; - let elapsed = match elapsed.checked_sub(inner.throttled) { - Some(elapsed) => elapsed, - None => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[inner.op]; - rate_limit.call(|| { - warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); - }); - elapsed // un-throttled time, more info than just saturating to 0 + let batch; + let execution; + let throttle; + match inner.timings { + SmgrOpTimerState::Received { received_at } => { + batch = (now - received_at).as_secs_f64(); + // TODO: use label for dropped requests. + // This is quite rare in practice, only during tenant/pageservers shutdown. + throttle = Duration::ZERO; + execution = Duration::ZERO.as_secs_f64(); } - }; + SmgrOpTimerState::ThrottleDoneExecutionStarting { + received_at, + throttle_started_at, + started_execution_at, + } => { + batch = (throttle_started_at - received_at).as_secs_f64(); + throttle = started_execution_at - throttle_started_at; + execution = (now - started_execution_at).as_secs_f64(); + } + } - let elapsed = elapsed.as_secs_f64(); + // update time spent in batching + inner.global_batch_wait_time.observe(batch); + inner.per_timeline_batch_wait_time.observe(batch); - inner.global_latency_histo.observe(elapsed); - if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(elapsed); + // time spent in throttle metric is updated by throttle impl + let _ = throttle; + + // update metrics for execution latency + inner.global_execution_latency_histo.observe(execution); + if let Some(per_timeline_execution_latency_histo) = + &inner.per_timeline_execution_latency_histo + { + per_timeline_execution_latency_histo.observe(execution); } Some((now, inner)) @@ -1325,12 +1360,12 @@ impl SmgrOpFlushInProgress { // Last call is tracked in `now`. let mut observe_guard = scopeguard::guard( || { - let elapsed = now - self.base; + let elapsed = now - self.flush_started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); self.per_timeline_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.base = now; + self.flush_started_at = now; }, |mut observe| { observe(); @@ -1377,6 +1412,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_batch_size: Histogram, global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, + global_batch_wait_time: Histogram, + per_timeline_batch_wait_time: Histogram, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1399,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(| .expect("failed to define a metric") }); +// Alias so all histograms recording per-timeline smgr timings use the same buckets. 
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS; + static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", - "Time spent on smgr query handling, aggegated by query type and tenant/timeline.", + "Time spent _executing_ smgr query handling, excluding batch and throttle delays.", &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -1462,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", - "Time spent on smgr query handling, aggregated by query type.", + "Like pageserver_smgr_query_seconds, but aggregated to instance level.", &["smgr_query_type"], SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) @@ -1559,6 +1599,25 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy = Lazy .expect("failed to define a metric") }); +static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_pagestream_batch_wait_time_seconds", + "Time a request spent waiting in its batch until the batch moved to throttle&execution.", + &["tenant_id", "shard_id", "timeline_id"], + SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_pagestream_batch_wait_time_seconds_global", + "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.", + SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(), + ) + .expect("failed to define a metric") +}); + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1599,6 +1658,11 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone(); + let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + let global_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS @@ -1614,9 +1678,11 @@ impl SmgrQueryTimePerTimeline { per_timeline_batch_size, global_flush_in_progress_micros, per_timeline_flush_in_progress_micros, + global_batch_wait_time, + per_timeline_batch_wait_time, } } - pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { @@ -1627,15 +1693,15 @@ impl SmgrQueryTimePerTimeline { }; SmgrOpTimer(Some(SmgrOpTimerInner { - global_latency_histo: self.global_latency[op as usize].clone(), - per_timeline_latency_histo, - start: started_at, - op, - throttled: Duration::ZERO, + global_execution_latency_histo: self.global_latency[op as usize].clone(), + per_timeline_execution_latency_histo: per_timeline_latency_histo, + timings: 
SmgrOpTimerState::Received { received_at }, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), + global_batch_wait_time: self.global_batch_wait_time.clone(), + per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), })) } @@ -2889,6 +2955,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2919,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; use crate::tenant::Timeline; /// Maintain a per timeline gauge in addition to the global gauge. @@ -3773,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &REMOTE_ONDEMAND_DOWNLOADED_BYTES, &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, + &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, ] .into_iter() .for_each(|c| { @@ -3820,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, &PAGE_SERVICE_BATCH_SIZE_GLOBAL, + &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 97d94bbe7f..d00ec11a76 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -575,7 +575,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { - async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { + async fn throttle_and_record_start_processing( + &mut self, + cancel: &CancellationToken, + ) -> Result<(), QueryError> { let (shard, tokens, timers) = match self { BatchedFeMessage::Exists { shard, timer, .. } | BatchedFeMessage::Nblocks { shard, timer, .. 
} @@ -603,7 +606,7 @@ impl BatchedFeMessage { } }; for timer in timers { - timer.deduct_throttle(&throttled); + timer.observe_throttle_done_execution_starting(&throttled); } Ok(()) } @@ -1230,7 +1233,7 @@ impl PageServerHandler { } }; - if let Err(cancelled) = msg.throttle(&self.cancel).await { + if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await { break cancelled; } @@ -1397,7 +1400,9 @@ impl PageServerHandler { return Err(e); } }; - batch.throttle(&self.cancel).await?; + batch + .throttle_and_record_start_processing(&self.cancel) + .await?; self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) .await?; } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 54c0e59daa..8ab6a0e060 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -58,6 +58,11 @@ pub struct Stats { pub sum_throttled_usecs: u64, } +pub enum ThrottleResult { + NotThrottled { start: Instant }, + Throttled { start: Instant, end: Instant }, +} + impl Throttle where M: Metric, @@ -122,15 +127,15 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, key_count: usize) -> Option { + pub async fn throttle(&self, key_count: usize) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc - if !inner.enabled { - return None; - } - let start = std::time::Instant::now(); + if !inner.enabled { + return ThrottleResult::NotThrottled { start }; + } + self.metric.accounting_start(); self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; @@ -145,9 +150,9 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); - Some(wait_time) + ThrottleResult::Throttled { start, end: now } } else { - None + ThrottleResult::NotThrottled { start } } } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index a591e088ef..c5295360c3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -178,6 +178,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_timeline_wal_records_received"), counter("pageserver_page_service_pagestream_flush_in_progress_micros"), *histogram("pageserver_page_service_batch_size"), + *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken From aeb79d1bb6da18448e31b56607a75a41e0a71c86 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 13 Dec 2024 06:02:24 +0000 Subject: [PATCH 391/412] Storage release 2024-12-13 From 6292d93867e69247377d5a5d105359b5659afa82 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:48:11 +0000 Subject: [PATCH 392/412] Compute release 2025-01-07 From 31bd2dcdb409b6059e21d7293c7b1ff8dbd1e2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 7 Jan 2025 14:45:18 +0100 Subject: [PATCH 393/412] Fix promote-images-prod after splitting it out (#10292) ## Problem `promote-images` was split into `promote-images-dev` and `promote-images-prod` in https://github.com/neondatabase/neon/pull/10267. 
`dev` credentials were loaded in `promote-images-dev` and `prod` credentials were loaded in `promote-images-prod`, but `promote-images-prod` needs `dev` credentials as well to access the `dev` images to replicate them from `dev` to `prod`. ## Summary of changes Load `dev` credentials in `promote-images-prod` as well. --- .github/workflows/build_and_test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5c2b397c82..01f5c3ede9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -979,6 +979,16 @@ jobs: VERSIONS: v14 v15 v16 v17 steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} From ebc313c768d58c4eb46105737451cde6da489b3f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 19:55:05 +0000 Subject: [PATCH 394/412] Compute release 2025-01-10 From 6a4d8ec41023586e9fdbecc3283828be4ee95e27 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:59:13 +0000 Subject: [PATCH 395/412] Compute release 2025-01-15 From 44ef8c884f409f2afd38cf249fc146994eb9285d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 24 Jan 2025 07:00:43 +0000 Subject: [PATCH 396/412] Compute release 2025-01-24 From 6d9846a9e51bf8df3928f603a031669a21fecee3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 31 Jan 2025 07:00:54 +0000 Subject: [PATCH 397/412] Compute release 2025-01-31 From ffc1a81b830d853dece75230b6c5c6a8d91947e1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Feb 2025 07:00:57 +0000 Subject: [PATCH 398/412] Compute release 2025-02-07 From 81367a6bbc887c6cef4826d8b6ecf0db81ce0a4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:48:02 +0000 Subject: [PATCH 399/412] Compute release 2025-02-18 From 723f9ad3ee8ed994a4d76a8a05a7a93405d45ed8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 20 Feb 2025 18:26:25 +0200 Subject: [PATCH 400/412] Temporarily disable pg_duckdb It clashed with pg_mooncake --- compute/compute-node.Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0491abe965..e78c26c0a6 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1669,7 +1669,11 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + +# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake +# also depends on libduckdb, but a different version. 
+#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ From 33e5930c9739762a5a0b99a20ac4312a1e8c4306 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:34:43 +0000 Subject: [PATCH 401/412] Compute release 2025-02-24 From 0d3f7a2b82ad0d803975711bfffb0194fa3f14bd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 28 Feb 2025 11:18:46 +0000 Subject: [PATCH 402/412] Compute release 2025-02-28 From 8542507ee554715af6f1060c94dc1153c8172753 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 07:00:52 +0000 Subject: [PATCH 403/412] Compute release 2025-03-07 From c7daf2b1e3414bd79d79160b9b79a47007aa395d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 3 Apr 2025 12:29:53 +0100 Subject: [PATCH 404/412] Compute: update plv8 patch (#11426) ## Problem https://github.com/neondatabase/cloud/issues/26866 ## Summary of changes - Update plv8 patch Co-authored-by: Alexey Kondratov --- compute/compute-node.Dockerfile | 4 +-- .../{plv8-3.1.10.patch => plv8_v3.1.10.patch} | 25 ++++++++++++------- compute/patches/plv8_v3.2.3.patch | 13 ++++++++++ 3 files changed, 31 insertions(+), 11 deletions(-) rename compute/patches/{plv8-3.1.10.patch => plv8_v3.1.10.patch} (80%) create mode 100644 compute/patches/plv8_v3.2.3.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1fe2570e87..a53b380b3f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -369,7 +369,7 @@ FROM build-deps AS plv8-src ARG PG_VERSION WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch . +COPY compute/patches/plv8* . # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. 
diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8_v3.1.10.patch similarity index 80% rename from compute/patches/plv8-3.1.10.patch rename to compute/patches/plv8_v3.1.10.patch index 43cdb479f7..5cf96426d0 100644 --- a/compute/patches/plv8-3.1.10.patch +++ b/compute/patches/plv8_v3.1.10.patch @@ -1,12 +1,6 @@ -commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e -Author: Alexander Bayandin -Date: Sat Nov 30 18:29:32 2024 +0000 - - Fix v8 9.7.37 compilation on Debian 12 - diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch new file mode 100644 -index 0000000..f0a5dc7 +index 0000000..fae1cb3 --- /dev/null +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch @@ -0,0 +1,30 @@ @@ -35,8 +29,21 @@ index 0000000..f0a5dc7 +@@ -5,6 +5,7 @@ + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ -+ ++ ++#include + #include -+ ++ + #include "include/cppgc/prefinalizer.h" +diff --git a/plv8.cc b/plv8.cc +index c1ce883..6e47e94 100644 +--- a/plv8.cc ++++ b/plv8.cc +@@ -379,7 +379,7 @@ _PG_init(void) + NULL, + &plv8_v8_flags, + NULL, +- PGC_USERSET, 0, ++ PGC_SUSET, 0, + #if PG_VERSION_NUM >= 90100 + NULL, + #endif diff --git a/compute/patches/plv8_v3.2.3.patch b/compute/patches/plv8_v3.2.3.patch new file mode 100644 index 0000000000..5cf4ae2fa2 --- /dev/null +++ b/compute/patches/plv8_v3.2.3.patch @@ -0,0 +1,13 @@ +diff --git a/plv8.cc b/plv8.cc +index edfa2aa..623e7f2 100644 +--- a/plv8.cc ++++ b/plv8.cc +@@ -385,7 +385,7 @@ _PG_init(void) + NULL, + &plv8_v8_flags, + NULL, +- PGC_USERSET, 0, ++ PGC_SUSET, 0, + #if PG_VERSION_NUM >= 90100 + NULL, + #endif From 817ec9979f225648e6e1b77fdb7c85923bf2bd84 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 16 Apr 2025 16:55:11 +0100 Subject: [PATCH 405/412] compute: fix copy-paste typo for neon GUC parameters check (#11610) fix for commit [5063151](https://github.com/neondatabase/neon/commit/50631512710d8c5fd9c4c681d7882e16f4df93f3) --- pgxn/neon/libpagestore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index dfabb6919e..a956664d29 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -1362,7 +1362,7 @@ pg_init_libpagestore(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_neon_id, NULL, NULL); + NULL, NULL, NULL); DefineCustomStringVariable("neon.branch_id", "Neon branch_id the server is running on", NULL, @@ -1370,7 +1370,7 @@ pg_init_libpagestore(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_neon_id, NULL, NULL); + NULL, NULL, NULL); DefineCustomStringVariable("neon.endpoint_id", "Neon endpoint_id the server is running on", NULL, @@ -1378,7 +1378,7 @@ pg_init_libpagestore(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_neon_id, NULL, NULL); + NULL, NULL, NULL); DefineCustomIntVariable("neon.stripe_size", "sharding stripe size", From fe4762a9aa4e0087ec4b5e52314ed27c2267548b Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 11 Apr 2025 10:06:29 -0500 Subject: [PATCH 406/412] Consolidate compute_ctl configuration structures (#11514) Previously, the structure of the spec file was just the compute spec. However, the response from the control plane get spec request included the compute spec and the compute_ctl config. This divergence was hindering other work such as adding regression tests for compute_ctl HTTP authorization. 
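As a rough sketch of the consolidated shape (field names and the use of `serde_json::Value` here are assumptions for illustration, not the actual `compute_api` definitions), a single config file can carry both pieces the control plane returns:

```rust
// Hypothetical sketch of a consolidated compute_ctl config file: one JSON
// document holding both the compute spec and compute_ctl's own settings,
// instead of a spec-only file. Names are illustrative, not the real types.
use serde::Deserialize;

#[derive(Deserialize)]
struct ComputeConfigSketch {
    /// The compute spec as the control plane returns it (left opaque here).
    spec: Option<serde_json::Value>,
    /// Settings consumed by compute_ctl itself, e.g. what it needs to
    /// authorize incoming HTTP requests.
    compute_ctl_config: serde_json::Value,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A config.json in this shape would be passed via `-c/--config`.
    let raw = r#"{ "spec": {}, "compute_ctl_config": {} }"#;
    let config: ComputeConfigSketch = serde_json::from_str(raw)?;
    assert!(config.spec.is_some());
    Ok(())
}
```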
Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 52 ++--- compute_tools/src/metrics.rs | 4 +- compute_tools/src/spec.rs | 50 ++--- control_plane/src/endpoint.rs | 196 ++++++++++-------- .../compute_wrapper/shell/compute.sh | 28 ++- .../var/db/postgres/configs/config.json | 148 +++++++++++++ .../var/db/postgres/specs/spec.json | 141 ------------- docker-compose/docker-compose.yml | 2 +- libs/compute_api/src/responses.rs | 28 ++- libs/compute_api/src/spec.rs | 10 +- test_runner/fixtures/neon_fixtures.py | 22 +- test_runner/regress/test_compute_catalog.py | 190 +++++++++-------- .../regress/test_compute_reconfigure.py | 20 +- .../regress/test_subscriber_branching.py | 12 +- 14 files changed, 491 insertions(+), 412 deletions(-) create mode 100644 docker-compose/compute_wrapper/var/db/postgres/configs/config.json delete mode 100644 docker-compose/compute_wrapper/var/db/postgres/specs/spec.json diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 4796a07d92..ea8350e2f5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -29,13 +29,12 @@ //! ```sh //! compute_ctl -D /var/db/postgres/compute \ //! -C 'postgresql://cloud_admin@localhost/postgres' \ -//! -S /var/db/postgres/specs/current.json \ +//! -c /var/db/postgres/configs/config.json \ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` use std::ffi::OsString; use std::fs::File; -use std::path::Path; use std::process::exit; use std::sync::mpsc; use std::thread; @@ -43,8 +42,7 @@ use std::time::Duration; use anyhow::{Context, Result}; use clap::Parser; -use compute_api::responses::ComputeCtlConfig; -use compute_api::spec::ComputeSpec; +use compute_api::responses::ComputeConfig; use compute_tools::compute::{ BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal, }; @@ -118,8 +116,10 @@ struct Cli { #[arg(long)] pub set_disk_quota_for_fs: Option, - #[arg(short = 'S', long, group = "spec-path")] - pub spec_path: Option, + // TODO(tristan957): remove alias after compatibility tests are no longer + // an issue + #[arg(short = 'c', long, alias = "spec-path")] + pub config: Option, #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, @@ -127,8 +127,9 @@ struct Cli { #[arg( short = 'p', long, - conflicts_with = "spec-path", - value_name = "CONTROL_PLANE_API_BASE_URL" + conflicts_with = "config", + value_name = "CONTROL_PLANE_API_BASE_URL", + requires = "compute-id" )] pub control_plane_uri: Option, } @@ -154,7 +155,7 @@ fn main() -> Result<()> { let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let cli_spec = get_config(&cli)?; let compute_node = ComputeNode::new( ComputeNodeParams { @@ -201,27 +202,17 @@ async fn init() -> Result<()> { Ok(()) } -fn try_spec_from_cli(cli: &Cli) -> Result { - // First, read spec from the path if provided - if let Some(ref spec_path) = cli.spec_path { - let file = File::open(Path::new(spec_path))?; - return Ok(CliSpecParams { - spec: Some(serde_json::from_reader(file)?), - compute_ctl_config: ComputeCtlConfig::default(), - }); +fn get_config(cli: &Cli) -> Result { + // First, read the config from the path if provided + if let Some(ref config) = cli.config { + let file = File::open(config)?; + return Ok(serde_json::from_reader(&file)?); } - if cli.control_plane_uri.is_none() { - panic!("must specify --control-plane-uri"); - }; - - // If the spec wasn't provided in the CLI arguments, 
then retrieve it from + // If the config wasn't provided in the CLI arguments, then retrieve it from // the control plane - match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { - Ok(resp) => Ok(CliSpecParams { - spec: resp.0, - compute_ctl_config: resp.1, - }), + match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { + Ok(config) => Ok(config), Err(e) => { error!( "cannot get response from control plane: {}\n\ @@ -233,13 +224,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result { } } -struct CliSpecParams { - /// If a spec was provided via CLI or file, the [`ComputeSpec`] - spec: Option, - #[allow(dead_code)] - compute_ctl_config: ComputeCtlConfig, -} - fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 52f1795703..fa00476fd2 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. // And it's fair to call it a 'RPC' (Remote Procedure Call). pub enum CPlaneRequestRPC { - GetSpec, + GetConfig, } impl CPlaneRequestRPC { pub fn as_str(&self) -> &str { match self { - CPlaneRequestRPC::GetSpec => "GetSpec", + CPlaneRequestRPC::GetConfig => "GetConfig", } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index a76af21e9f..4b38e6e29c 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -3,9 +3,8 @@ use std::path::Path; use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ - ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, + ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse, }; -use compute_api::spec::ComputeSpec; use reqwest::StatusCode; use tokio_postgres::Client; use tracing::{error, info, instrument}; @@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -29,14 +28,14 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {:?}", e), + format!("could not perform request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; let status = resp.status(); match status { - StatusCode::OK => match resp.json::() { + StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, @@ -69,40 +68,35 @@ fn do_control_plane_request( } } -/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN` -/// env variable is set, it will be used for authorization. -pub fn get_spec_from_control_plane( - base_uri: &str, - compute_id: &str, -) -> Result<(Option, ComputeCtlConfig)> { +/// Request config from the control-plane by compute_id. If +/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for +/// authorization. 
+pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); - let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { - Ok(v) => v, - Err(_) => "".to_string(), - }; + let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default(); let mut attempt = 1; - info!("getting spec from control plane: {}", cp_uri); + info!("getting config from control plane: {}", cp_uri); // Do 3 attempts to get spec from the control plane using the following logic: // - network error -> then retry // - compute id is unknown or any other error -> bail out // - no spec for compute yet (Empty state) -> return Ok(None) - // - got spec -> return Ok(Some(spec)) + // - got config -> return Ok(Some(config)) while attempt < 4 { let result = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => { + Ok(config_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ - CPlaneRequestRPC::GetSpec.as_str(), + CPlaneRequestRPC::GetConfig.as_str(), &StatusCode::OK.to_string(), ]) .inc(); - match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), + match config_resp.status { + ControlPlaneComputeStatus::Empty => Ok(config_resp.into()), ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok((Some(spec), spec_resp.compute_ctl_config)) + if config_resp.spec.is_some() { + Ok(config_resp.into()) } else { bail!("compute is attached, but spec is empty") } @@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane( } Err((retry, msg, status)) => { CPLANE_REQUESTS_TOTAL - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) + .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status]) .inc(); if retry { Err(anyhow!(msg)) @@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane( }; if let Err(e) = &result { - error!("attempt {} to get spec failed with: {}", attempt, e); + error!("attempt {} to get config failed with: {}", attempt, e); } else { return result; } @@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane( // All attempts failed, return error. Err(anyhow::anyhow!( - "Exhausted all attempts to retrieve the spec from the control plane" + "Exhausted all attempts to retrieve the config from the control plane" )) } /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { - // XXX: consider making it a part of spec.json + // XXX: consider making it a part of config.json let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { @@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { - // XXX: consider making it a part of spec.json + // XXX: consider making it a part of config.json let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 663c024953..2fa7a62f8f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -29,7 +29,7 @@ //! compute.log - log output of `compute_ctl` and `postgres` //! endpoint.json - serialized `EndpointConf` struct //! postgresql.conf - postgresql settings -//! spec.json - passed to `compute_ctl` +//! config.json - passed to `compute_ctl` //! pgdata/ //! 
postgresql.conf - copy of postgresql.conf created by `compute_ctl` //! zenith.signal @@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, +}; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, @@ -619,90 +621,101 @@ impl Endpoint { remote_extensions = None; }; - // Create spec file - let mut spec = ComputeSpec { - skip_pg_catalog_updates: self.skip_pg_catalog_updates, - format_version: 1.0, - operation_uuid: None, - features: self.features.clone(), - swap_size_bytes: None, - disk_quota_bytes: None, - disable_lfc_resizing: None, - cluster: Cluster { - cluster_id: None, // project ID: not used - name: None, // project name: not used - state: None, - roles: if create_test_user { - vec![Role { + // Create config file + let config = { + let mut spec = ComputeSpec { + skip_pg_catalog_updates: self.skip_pg_catalog_updates, + format_version: 1.0, + operation_uuid: None, + features: self.features.clone(), + swap_size_bytes: None, + disk_quota_bytes: None, + disable_lfc_resizing: None, + cluster: Cluster { + cluster_id: None, // project ID: not used + name: None, // project name: not used + state: None, + roles: if create_test_user { + vec![Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }] + } else { + Vec::new() + }, + databases: if create_test_user { + vec![Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }] + } else { + Vec::new() + }, + settings: None, + postgresql_conf: Some(postgresql_conf.clone()), + }, + delta_operations: None, + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + project_id: None, + branch_id: None, + endpoint_id: Some(self.endpoint_id.clone()), + mode: self.mode, + pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), + safekeeper_connstrings, + storage_auth_token: auth_token.clone(), + remote_extensions, + pgbouncer_settings: None, + shard_stripe_size: Some(shard_stripe_size), + local_proxy_config: None, + reconfigure_concurrency: self.reconfigure_concurrency, + drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, + logs_export_host: None::, + }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, options: None, - }] - } else { - Vec::new() - }, - databases: if create_test_user { - vec![Database { + }); + spec.cluster.databases.push(Database { name: PgIdent::from_str("neondb").unwrap(), owner: PgIdent::from_str("test").unwrap(), options: None, restrict_conn: false, invalid: false, - }] - } else { - Vec::new() - }, - settings: None, - postgresql_conf: Some(postgresql_conf.clone()), - }, - delta_operations: None, - 
tenant_id: Some(self.tenant_id), - timeline_id: Some(self.timeline_id), - project_id: None, - branch_id: None, - endpoint_id: Some(self.endpoint_id.clone()), - mode: self.mode, - pageserver_connstring: Some(pageserver_connstring), - safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), - safekeeper_connstrings, - storage_auth_token: auth_token.clone(), - remote_extensions, - pgbouncer_settings: None, - shard_stripe_size: Some(shard_stripe_size), - local_proxy_config: None, - reconfigure_concurrency: self.reconfigure_concurrency, - drop_subscriptions_before_start: self.drop_subscriptions_before_start, - audit_log_level: ComputeAudit::Disabled, - logs_export_host: None::, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + + ComputeConfig { + spec: Some(spec), + compute_ctl_config: ComputeCtlConfig::default(), + } }; - // this strange code is needed to support respec() in tests - if self.cluster.is_some() { - debug!("Cluster is already set in the endpoint spec, using it"); - spec.cluster = self.cluster.clone().unwrap(); - - debug!("spec.cluster {:?}", spec.cluster); - - // fill missing fields again - if create_test_user { - spec.cluster.roles.push(Role { - name: PgIdent::from_str("test").unwrap(), - encrypted_password: None, - options: None, - }); - spec.cluster.databases.push(Database { - name: PgIdent::from_str("neondb").unwrap(), - owner: PgIdent::from_str("test").unwrap(), - options: None, - restrict_conn: false, - invalid: false, - }); - } - spec.cluster.postgresql_conf = Some(postgresql_conf); - } - + // TODO(tristan957): Remove the write to spec.json after compatibility + // tests work themselves out let spec_path = self.endpoint_path().join("spec.json"); - std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; + std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?; + let config_path = self.endpoint_path().join("config.json"); + std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?; // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. let logfile = std::fs::OpenOptions::new() @@ -710,6 +723,16 @@ impl Endpoint { .append(true) .open(self.endpoint_path().join("compute.log"))?; + // TODO(tristan957): Remove when compatibility tests are no longer an + // issue + let old_compute_ctl = { + let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); + let help_output = cmd.arg("--help").output()?; + let help_output = String::from_utf8_lossy(&help_output.stdout); + + !help_output.contains("--config") + }; + // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{}'", conn_str); @@ -728,9 +751,18 @@ impl Endpoint { ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) + // TODO(tristan957): Change this to --config when compatibility tests + // are no longer an issue .args([ "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), + self.endpoint_path() + .join(if old_compute_ctl { + "spec.json" + } else { + "config.json" + }) + .to_str() + .unwrap(), ]) .args([ "--pgbin", @@ -873,10 +905,12 @@ impl Endpoint { stripe_size: Option, safekeepers: Option>, ) -> Result<()> { - let mut spec: ComputeSpec = { - let spec_path = self.endpoint_path().join("spec.json"); - let file = std::fs::File::open(spec_path)?; - serde_json::from_reader(file)? 
+ let (mut spec, compute_ctl_config) = { + let config_path = self.endpoint_path().join("config.json"); + let file = std::fs::File::open(config_path)?; + let config: ComputeConfig = serde_json::from_reader(file)?; + + (config.spec.unwrap(), config.compute_ctl_config) }; let postgresql_conf = self.read_postgresql_conf()?; @@ -926,7 +960,7 @@ impl Endpoint { .body( serde_json::to_string(&ConfigurationRequest { spec, - compute_ctl_config: ComputeCtlConfig::default(), + compute_ctl_config, }) .unwrap(), ) diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 418aaf876d..9409e9d055 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -11,8 +11,8 @@ generate_id() { PG_VERSION=${PG_VERSION:-14} -SPEC_FILE_ORG=/var/db/postgres/specs/spec.json -SPEC_FILE=/tmp/spec.json +CONFIG_FILE_ORG=/var/db/postgres/configs/config.json +CONFIG_FILE=/tmp/config.json echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do @@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." -cp ${SPEC_FILE_ORG} ${SPEC_FILE} +cp ${CONFIG_FILE_ORG} ${CONFIG_FILE} if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then tenant_id=${TENANT_ID} @@ -73,17 +73,27 @@ else ulid_extension=ulid fi echo "Adding pgx_ulid" -shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE}) -sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE} +shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE}) +sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE} echo "Overwrite tenant id and timeline id in spec file" -sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} -sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE} -cat ${SPEC_FILE} +cat ${CONFIG_FILE} + +# TODO(tristan957): Remove these workarounds for backwards compatibility after +# the next compute release. That includes these next few lines and the +# --spec-path in the compute_ctl invocation. 
+if compute_ctl --help | grep --quiet -- '--config'; then + SPEC_PATH="$CONFIG_FILE" +else + jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json + SPEC_PATH=/tmp/spec.json +fi echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ --compute-id "compute-$RANDOM" \ - -S ${SPEC_FILE} + --spec-path "$SPEC_PATH" diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json new file mode 100644 index 0000000000..3ddf96512a --- /dev/null +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -0,0 +1,148 @@ +{ + "spec": { + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "logical", + "vartype": "enum" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon,pg_cron,timescaledb,pg_stat_statements", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + }, + { + "name": "cron.database", + "value": "postgres", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] + }, + "compute_ctl_config": { + "jwks": { + "keys": [] + } + } +} diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json deleted file mode 100644 index 0308cab451..0000000000 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ /dev/null @@ -1,141 +0,0 @@ -{ - "format_version": 1.0, - - 
"timestamp": "2022-10-12T18:00:00.000Z", - "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", - - "cluster": { - "cluster_id": "docker_compose", - "name": "docker_compose_test", - "state": "restarted", - "roles": [ - { - "name": "cloud_admin", - "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", - "options": null - } - ], - "databases": [ - ], - "settings": [ - { - "name": "fsync", - "value": "off", - "vartype": "bool" - }, - { - "name": "wal_level", - "value": "logical", - "vartype": "enum" - }, - { - "name": "wal_log_hints", - "value": "on", - "vartype": "bool" - }, - { - "name": "log_connections", - "value": "on", - "vartype": "bool" - }, - { - "name": "port", - "value": "55433", - "vartype": "integer" - }, - { - "name": "shared_buffers", - "value": "1MB", - "vartype": "string" - }, - { - "name": "max_connections", - "value": "100", - "vartype": "integer" - }, - { - "name": "listen_addresses", - "value": "0.0.0.0", - "vartype": "string" - }, - { - "name": "max_wal_senders", - "value": "10", - "vartype": "integer" - }, - { - "name": "max_replication_slots", - "value": "10", - "vartype": "integer" - }, - { - "name": "wal_sender_timeout", - "value": "5s", - "vartype": "string" - }, - { - "name": "wal_keep_size", - "value": "0", - "vartype": "integer" - }, - { - "name": "password_encryption", - "value": "md5", - "vartype": "enum" - }, - { - "name": "restart_after_crash", - "value": "off", - "vartype": "bool" - }, - { - "name": "synchronous_standby_names", - "value": "walproposer", - "vartype": "string" - }, - { - "name": "shared_preload_libraries", - "value": "neon,pg_cron,timescaledb,pg_stat_statements", - "vartype": "string" - }, - { - "name": "neon.safekeepers", - "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", - "vartype": "string" - }, - { - "name": "neon.timeline_id", - "value": "TIMELINE_ID", - "vartype": "string" - }, - { - "name": "neon.tenant_id", - "value": "TENANT_ID", - "vartype": "string" - }, - { - "name": "neon.pageserver_connstring", - "value": "host=pageserver port=6400", - "vartype": "string" - }, - { - "name": "max_replication_write_lag", - "value": "500MB", - "vartype": "string" - }, - { - "name": "max_replication_flush_lag", - "value": "10GB", - "vartype": "string" - }, - { - "name": "cron.database", - "value": "postgres", - "vartype": "string" - } - ] - }, - - "delta_operations": [ - ] -} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 493a0a5523..fd3ad1fffc 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -159,7 +159,7 @@ services: #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: - - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/ - ./compute_wrapper/shell/:/shell/ ports: - 55433:55433 # pg protocol handler diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index c8f6019c5c..353949736b 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -14,6 +14,32 @@ pub struct GenericAPIError { pub error: String, } +/// All configuration parameters necessary for a compute. When +/// [`ComputeConfig::spec`] is provided, it means that the compute is attached +/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided +/// and contains parameters necessary for operating `compute_ctl` independently +/// of whether a tenant is attached to the compute or not. 
+/// +/// This also happens to be the body of `compute_ctl`'s /configure request. +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeConfig { + /// The compute spec + pub spec: Option, + + /// The compute_ctl configuration + #[allow(dead_code)] + pub compute_ctl_config: ComputeCtlConfig, +} + +impl From for ComputeConfig { + fn from(value: ControlPlaneConfigResponse) -> Self { + Self { + spec: value.spec, + compute_ctl_config: value.compute_ctl_config, + } + } +} + #[derive(Debug, Clone, Serialize)] pub struct ExtensionInstallResponse { pub extension: PgIdent, @@ -161,7 +187,7 @@ pub struct TlsConfig { /// Response of the `/computes/{compute_id}/spec` control-plane API. #[derive(Deserialize, Debug)] -pub struct ControlPlaneSpecResponse { +pub struct ControlPlaneConfigResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, pub compute_ctl_config: ComputeCtlConfig, diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 82950bcbaa..5e67ccce00 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -1,8 +1,8 @@ -//! `ComputeSpec` represents the contents of the spec.json file. -//! -//! The spec.json file is used to pass information to 'compute_ctl'. It contains -//! all the information needed to start up the right version of PostgreSQL, -//! and connect it to the storage nodes. +//! The ComputeSpec contains all the information needed to start up +//! the right version of PostgreSQL, and connect it to the storage nodes. +//! It can be passed as part of the `config.json`, or the control plane can +//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or +//! compute_ctl can fetch it by calling the control plane's API. use std::collections::HashMap; use indexmap::IndexMap; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 858d367abf..0dd5508f12 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4297,10 +4297,10 @@ class Endpoint(PgProtocol, LogUtils): def respec_deep(self, **kwargs: Any) -> None: """ - Update the spec.json file taking into account nested keys. - Distinct method from respec() to not break existing functionality. - NOTE: This method also updates the spec.json file, not endpoint.json. - We need it because neon_local also writes to spec.json, so intended + Update the endpoint.json file taking into account nested keys. + Distinct method from respec() to do not break existing functionality. + NOTE: This method also updates the config.json file, not endpoint.json. + We need it because neon_local also writes to config.json, so intended use-case is i) start endpoint with some config, ii) respec_deep(), iii) call reconfigure() to apply the changes. 
""" @@ -4313,17 +4313,17 @@ class Endpoint(PgProtocol, LogUtils): curr[k] = v return curr - config_path = os.path.join(self.endpoint_path(), "spec.json") + config_path = os.path.join(self.endpoint_path(), "config.json") with open(config_path) as f: - data_dict: dict[str, Any] = json.load(f) + config: dict[str, Any] = json.load(f) - log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4)) + log.debug("Current compute config: %s", json.dumps(config, indent=4)) - update(data_dict, kwargs) + update(config, kwargs) with open(config_path, "w") as file: - log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) - json.dump(data_dict, file, indent=4) + log.debug("Updating compute config to: %s", json.dumps(config, indent=4)) + json.dump(config, file, indent=4) def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: """ @@ -4340,7 +4340,7 @@ class Endpoint(PgProtocol, LogUtils): wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing - # endpooint.rs adds content of this file as a part of the spec.json + # endpooint.rs adds content of this file as a part of the config.json def create_remote_extension_spec(self, spec: dict[str, Any]): """Create a remote extension spec file for the endpoint.""" remote_extensions_spec_path = os.path.join( diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index c1f05830b7..37208c9fff 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -90,10 +90,12 @@ def test_compute_catalog(neon_simple_env: NeonEnv): # and reconfigure the endpoint to create some test databases. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": TEST_ROLE_NAMES, - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": TEST_ROLE_NAMES, + "databases": TEST_DB_NAMES, + }, }, } ) @@ -155,10 +157,12 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": TEST_ROLE_NAMES, - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": TEST_ROLE_NAMES, + "databases": TEST_DB_NAMES, + }, }, } ) @@ -196,12 +200,14 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [], - "databases": [], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, }, - "delta_operations": delta_operations, } ) endpoint.reconfigure() @@ -250,9 +256,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, }, } ) @@ -306,17 +314,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. 
endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES_NEW, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES_NEW, + }, + "delta_operations": [ + {"action": "delete_db", "name": SUB_DB_NAME}, + # also test the case when we try to delete a non-existent database + # shouldn't happen in normal operation, + # but can occur when failed operations are retried + {"action": "delete_db", "name": "nonexistent_db"}, + ], }, - "delta_operations": [ - {"action": "delete_db", "name": SUB_DB_NAME}, - # also test the case when we try to delete a non-existent database - # shouldn't happen in normal operation, - # but can occur when failed operations are retried - {"action": "delete_db", "name": "nonexistent_db"}, - ], } ) @@ -354,25 +364,27 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [ - { - # We need to create role via compute_ctl, because in this case it will receive - # additional grants equivalent to our real environment, so we can repro some - # issues. - "name": "neon", - # Some autocomplete-suggested hash, no specific meaning. - "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", - "options": [], - }, - ], - "databases": [ - { - "name": TEST_DB_NAME, - "owner": "neon", - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": "neon", + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": "neon", + }, + ], + }, }, } ) @@ -415,13 +427,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne # Drop role via compute_ctl endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": TEST_GRANTEE, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + }, } ) endpoint.reconfigure() @@ -444,13 +458,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": "readonly2", - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly2", + }, + ], + }, } ) endpoint.reconfigure() @@ -475,25 +491,27 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env endpoint = env.endpoints.create_start("main") endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [ - { - # We need to create role via compute_ctl, because in this case it will receive - # additional grants equivalent to our real environment, so we can repro some - # issues. - "name": TEST_GRANTOR, - # Some autocomplete-suggested hash, no specific meaning. 
- "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", - "options": [], - }, - ], - "databases": [ - { - "name": TEST_DB_NAME, - "owner": TEST_GRANTOR, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": TEST_GRANTOR, + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": TEST_GRANTOR, + }, + ], + }, }, } ) @@ -507,13 +525,15 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": TEST_GRANTEE, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + }, } ) endpoint.reconfigure() diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py index 6396ba67a1..b533d45b1e 100644 --- a/test_runner/regress/test_compute_reconfigure.py +++ b/test_runner/regress/test_compute_reconfigure.py @@ -31,15 +31,17 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv): endpoint.respec_deep( **{ - "skip_pg_catalog_updates": True, - "cluster": { - "settings": [ - { - "name": "log_line_prefix", - "vartype": "string", - "value": TEST_LOG_LINE_PREFIX, - } - ] + "spec": { + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + } + ] + }, }, } ) diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 6175643389..83bebc19be 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -251,7 +251,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): NUMBER_OF_DBS = 5 # Create and start endpoint so that neon_local put all the generated - # stuff into the spec.json file. + # stuff into the config.json file. endpoint = env.endpoints.create_start( "main", config_lines=[ @@ -280,13 +280,15 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): } ) - # Update the spec.json file to create the databases + # Update the config.json file to create the databases # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, }, } ) From e6b7589ba494f93b7337b1eef1ce5a58be7c33d1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 24 Apr 2025 17:46:10 +0300 Subject: [PATCH 407/412] Undo commit d1728a6bcd872ab9d79a95a5f3f5fa72243ed898 because it causes problems with creating pg_search extension (#11700) ## Problem See https://neondb.slack.com/archives/C03H1K0PGKH/p1745489241982209 pg_search extension now can not be created. 
## Summary of changes Undo d1728a6bcd872ab9d79a95a5f3f5fa72243ed898 Co-authored-by: Konstantin Knizhnik --- compute_tools/src/bin/compute_ctl.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index aacef91d56..e337ee7b15 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -57,13 +57,24 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; +// Compatibility hack: if the control plane specified any remote-ext-config +// use the default value for extension storage proxy gateway. +// Remove this once the control plane is updated to pass the gateway URL +fn parse_remote_ext_config(arg: &str) -> Result { + if arg.starts_with("http") { + Ok(arg.trim_end_matches('/').to_string()) + } else { + Ok("http://pg-ext-s3-gateway".to_string()) + } +} + #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long)] + #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] pub remote_ext_config: Option, /// The port to bind the external listening HTTP server to. Clients running From b5deda3e0863b33c19375d65e37ef996ec36453a Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 22 Apr 2025 17:47:22 -0500 Subject: [PATCH 408/412] Add CPU architecture to the remote extensions object key (#11590) ARM computes are incoming and we need to account for that in remote extensions. Previously, we just blindly assumed that all computes were x86_64. Note that we use the Go architecture naming convention instead of the Rust one directly to do our best and be consistent across the stack. Part-of: https://github.com/neondatabase/cloud/issues/23148 Signed-off-by: Tristan Partin --- libs/compute_api/src/spec.rs | 13 +++++++++++-- test_runner/regress/test_download_extensions.py | 14 +++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5e67ccce00..ad246c48ec 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -242,13 +242,22 @@ impl RemoteExtSpec { match self.extension_data.get(real_ext_name) { Some(_ext_data) => { + // We have decided to use the Go naming convention due to Kubernetes. 
+ + let arch = match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + arch => arch, + }; + // Construct the path to the extension archive // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst // // Keep it in sync with path generation in // https://github.com/neondatabase/build-custom-extensions/tree/main - let archive_path_str = - format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"); + let archive_path_str = format!( + "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst" + ); Ok(( real_ext_name.to_string(), RemotePath::from_string(&archive_path_str)?, diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 77babe12cd..a81d55e57b 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import platform import shutil import tarfile from typing import TYPE_CHECKING @@ -58,7 +59,18 @@ def test_remote_extensions( extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" build_tag = os.environ.get("BUILD_TAG", "latest") - archive_route = f"{build_tag}/v{pg_version}/extensions/test_extension.tar.zst" + + # We have decided to use the Go naming convention due to Kubernetes. + arch = platform.machine() + match arch: + case "aarch64": + arch = "arm64" + case "x86_64": + arch = "amd64" + case _: + pass + + archive_route = f"{build_tag}/{arch}/v{pg_version}/extensions/test_extension.tar.zst" tarball = test_output_dir / "test_extension.tar" extension_dir = ( base_dir / "test_runner" / "regress" / "data" / "test_remote_extensions" / "test_extension" From 15fd43e0f6a246a0a76c0ae2c24d675938e1bcde Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 28 May 2025 19:52:55 +0300 Subject: [PATCH 409/412] Revert "Add online_advisor extension (#11898)" This reverts commit 8ff25dca8eb579f7c17e0cc0833c8ad8fa3344f7. There was a CI failure on the online-advistor tests. While that's been investigated, don't include it. 
--- compute/compute-node.Dockerfile | 34 --------------------------------- 1 file changed, 34 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 3459983a34..55d008acea 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -582,38 +582,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control -######################################################################################### -# -# Layer "online_advisor-build" -# compile online_advisor extension -# -######################################################################################### -FROM build-deps AS online_advisor-src -ARG PG_VERSION - -# online_advisor supports all Postgres version starting from PG14, but prior to PG17 has to be included in preload_shared_libraries -# last release 1.0 - May 15, 2025 -WORKDIR /ext-src -RUN case "${PG_VERSION:?}" in \ - "v17") \ - ;; \ - *) \ - echo "skipping the version of online_advistor for $PG_VERSION" && exit 0 \ - ;; \ - esac && \ - wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \ - echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \ - mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C . - -FROM pg-build AS online_advisor-build -COPY --from=online_advisor-src /ext-src/ /ext-src/ -WORKDIR /ext-src/ -RUN if [ -d online_advisor-src ]; then \ - cd online_advisor-src && \ - make -j install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \ - fi - ######################################################################################### # # Layer "pg_hashids-build" @@ -1680,7 +1648,6 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1856,7 +1823,6 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/ COPY --from=pg_graphql-src /ext-src/ /ext-src/ #COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ COPY --from=hypopg-src /ext-src/ /ext-src/ -COPY --from=online_advisor-src /ext-src/ /ext-src/ COPY --from=pg_hashids-src /ext-src/ /ext-src/ COPY --from=rum-src /ext-src/ /ext-src/ COPY --from=pgtap-src /ext-src/ /ext-src/ From 098fd689d1edcecc0ebe27af7755fb6eeff28179 Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Fri, 30 May 2025 17:26:22 +0200 Subject: [PATCH 410/412] fix(compute) Remove the hardcoded default value for PGXN HTTP URL. (#12030) Removes the hardcoded value for the Postgres Extensions HTTP gateway URL as it is always provided by the calling code. 
--- compute_tools/src/bin/compute_ctl.rs | 31 +--------------------------- 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 02339f752c..f9d9c03422 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -57,21 +57,6 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -// Compatibility hack: if the control plane specified any remote-ext-config -// use the default value for extension storage proxy gateway. -// Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_base_url(arg: &str) -> Result { - const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = - "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; - - Ok(if arg.starts_with("http") { - arg - } else { - FALLBACK_PG_EXT_GATEWAY_BASE_URL - } - .to_owned()) -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -80,7 +65,7 @@ struct Cli { /// The base URL for the remote extension storage proxy gateway. /// Should be in the form of `http(s)://[:]`. - #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + #[arg(short = 'r', long, alias = "remote-ext-config")] pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running @@ -276,18 +261,4 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } - - #[test] - fn parse_pg_ext_gateway_base_url() { - let arg = "http://pg-ext-s3-gateway2"; - let result = super::parse_remote_ext_base_url(arg).unwrap(); - assert_eq!(result, arg); - - let arg = "pg-ext-s3-gateway"; - let result = super::parse_remote_ext_base_url(arg).unwrap(); - assert_eq!( - result, - "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" - ); - } } From c1ae29dea0bd861b8845c6a9deb690d34ed8914b Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Fri, 30 May 2025 19:45:24 +0200 Subject: [PATCH 411/412] impr(compute): Remove the deprecated CLI arg alias for remote-ext-config. (#12087) Also moves it from `String` to `Url`. --- compute_tools/src/bin/compute_ctl.rs | 5 ++--- compute_tools/src/compute.rs | 3 ++- compute_tools/src/extension_server.rs | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f9d9c03422..db6835da61 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -64,9 +64,8 @@ struct Cli { pub pgbin: String, /// The base URL for the remote extension storage proxy gateway. - /// Should be in the form of `http(s)://[:]`. - #[arg(short = 'r', long, alias = "remote-ext-config")] - pub remote_ext_base_url: Option, + #[arg(short = 'r', long)] + pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. 
Keep diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index ff49c737f0..d678b7d670 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -31,6 +31,7 @@ use std::time::{Duration, Instant}; use std::{env, fs}; use tokio::spawn; use tracing::{Instrument, debug, error, info, instrument, warn}; +use url::Url; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; @@ -96,7 +97,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub remote_ext_base_url: Option, + pub remote_ext_base_url: Option, /// Interval for installed extensions collection pub installed_extensions_collection_interval: u64, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 3439383699..1857afa08c 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -83,6 +83,7 @@ use reqwest::StatusCode; use tar::Archive; use tracing::info; use tracing::log::warn; +use url::Url; use zstd::stream::read::Decoder; use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; @@ -158,14 +159,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - remote_ext_base_url: &str, + remote_ext_base_url: &Url, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( From 1823c915258954e6e9306457b08f9ebb58277e0f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 4 Jun 2025 10:56:12 -0500 Subject: [PATCH 412/412] Use Url::join() when creating the final remote extension URL (#12121) Url::to_string() adds a trailing slash on the base URL, so when we did the format!(), we were adding a double forward slash. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 67 ++++++++++++++- compute_tools/src/extension_server.rs | 16 ++-- libs/compute_api/src/spec.rs | 85 ++++++++++++++----- .../regress/test_download_extensions.py | 7 +- 4 files changed, 140 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index db6835da61..8b502a058e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,7 +40,7 @@ use std::sync::mpsc; use std::thread; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, bail}; use clap::Parser; use compute_api::responses::ComputeConfig; use compute_tools::compute::{ @@ -57,14 +57,14 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -#[derive(Parser)] +#[derive(Debug, Parser)] #[command(rename_all = "kebab-case")] struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, /// The base URL for the remote extension storage proxy gateway. - #[arg(short = 'r', long)] + #[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)] pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. 
Clients running @@ -126,6 +126,25 @@ struct Cli { pub installed_extensions_collection_interval: u64, } +impl Cli { + /// Parse a URL from an argument. By default, this isn't necessary, but we + /// want to do some sanity checking. + fn parse_remote_ext_base_url(value: &str) -> Result { + // Remove extra trailing slashes, and add one. We use Url::join() later + // when downloading remote extensions. If the base URL is something like + // http://example.com/pg-ext-s3-gateway, and join() is called with + // something like "xyz", the resulting URL is http://example.com/xyz. + let value = value.trim_end_matches('/').to_owned() + "/"; + let url = Url::parse(&value)?; + + if url.query_pairs().count() != 0 { + bail!("parameters detected in remote extensions base URL") + } + + Ok(url) + } +} + fn main() -> Result<()> { let cli = Cli::parse(); @@ -252,7 +271,8 @@ fn handle_exit_signal(sig: i32) { #[cfg(test)] mod test { - use clap::CommandFactory; + use clap::{CommandFactory, Parser}; + use url::Url; use super::Cli; @@ -260,4 +280,43 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn verify_remote_ext_base_url() { + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com/subpath", + ]); + assert_eq!( + cli.remote_ext_base_url.unwrap(), + Url::parse("https://example.com/subpath/").unwrap() + ); + + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com//", + ]); + assert_eq!( + cli.remote_ext_base_url.unwrap(), + Url::parse("https://example.com").unwrap() + ); + + Cli::try_parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com?hello=world", + ]) + .expect_err("URL parameters are not allowed"); + } } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 1857afa08c..3764bc1525 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -166,7 +166,7 @@ pub async fn download_extension( // TODO add retry logic let download_buffer = - match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -271,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { } // Do request to extension storage proxy, e.g., -// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst +// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. 
-async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", remote_ext_base_url, ext_path); +async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result { + let uri = remote_ext_base_url.join(ext_path).with_context(|| { + format!( + "failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}" + ) + })?; let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) @@ -284,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re info!("Downloading extension file '{}' from uri {}", filename, uri); - match do_extension_server_request(&uri).await { + match do_extension_server_request(uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL @@ -303,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re // Do a single remote extensions server request. // Return result or (error message + stringified status code) in case of any failures. -async fn do_extension_server_request(uri: &str) -> Result { +async fn do_extension_server_request(uri: Url) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( format!( diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 09b550b96c..53bc36b2c2 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -250,34 +250,44 @@ impl RemoteExtSpec { } match self.extension_data.get(real_ext_name) { - Some(_ext_data) => { - // We have decided to use the Go naming convention due to Kubernetes. - - let arch = match std::env::consts::ARCH { - "x86_64" => "amd64", - "aarch64" => "arm64", - arch => arch, - }; - - // Construct the path to the extension archive - // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst - // - // Keep it in sync with path generation in - // https://github.com/neondatabase/build-custom-extensions/tree/main - let archive_path_str = format!( - "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst" - ); - Ok(( - real_ext_name.to_string(), - RemotePath::from_string(&archive_path_str)?, - )) - } + Some(_ext_data) => Ok(( + real_ext_name.to_string(), + Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?, + )), None => Err(anyhow::anyhow!( "real_ext_name {} is not found", real_ext_name )), } } + + /// Get the architecture-specific portion of the remote extension path. We + /// use the Go naming convention due to Kubernetes. + fn get_arch() -> &'static str { + match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + arch => arch, + } + } + + /// Build a [`RemotePath`] for an extension. 
+ fn build_remote_path( + build_tag: &str, + pg_major_version: &str, + ext_name: &str, + ) -> anyhow::Result { + let arch = Self::get_arch(); + + // Construct the path to the extension archive + // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst + // + // Keep it in sync with path generation in + // https://github.com/neondatabase/build-custom-extensions/tree/main + RemotePath::from_string(&format!( + "{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst" + )) + } } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] @@ -518,6 +528,37 @@ mod tests { .expect("Library should be found"); } + #[test] + fn remote_extension_path() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "extlib": "ext", + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + let (_ext_name, ext_path) = rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + // Starting with a forward slash would have consequences for the + // Url::join() that occurs when downloading a remote extension. + assert!(!ext_path.to_string().starts_with("/")); + assert_eq!( + ext_path, + RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap() + ); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 24ba0713d2..fe3b220c67 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -159,7 +159,8 @@ def test_remote_extensions( # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address - extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" + remote_ext_base_url = f"http://{host}:{port}/pg-ext-s3-gateway" + log.info(f"remote extensions base URL: {remote_ext_base_url}") extension.build(pg_config, test_output_dir) tarball = extension.package(test_output_dir) @@ -221,7 +222,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_base_url=extensions_endpoint) + endpoint.start(remote_ext_base_url=remote_ext_base_url) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +250,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_base_url=extensions_endpoint) + endpoint.start(remote_ext_base_url=remote_ext_base_url) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: